From f4eedcf633f455e8ac11c58dcc4d3d536bfd42d6 Mon Sep 17 00:00:00 2001 From: "PVS.NarasimhaRao" Date: Mon, 1 Jun 2026 19:52:56 +0800 Subject: [PATCH 01/75] anolis: x86/sev: Restore IBPB-on-Entry SNP feature lost in Dump SEV_STATUS ANBZ: #34453 Commit 7016fad256fa ("x86/sev: Dump SEV_STATUS") accidentally removed MSR_AMD64_SNP_IBPB_ON_ENTRY_BIT (bit 23) and reset MSR_AMD64_SNP_RESV_BIT from 24 to 18, making bit 23 appear reserved and breaking SEV-SNP guests on Zen5 hypervisors that enable IBPB-on-Entry. Restore MSR_AMD64_SNP_IBPB_ON_ENTRY_BIT (bit 23) and move MSR_AMD64_SNP_RESV_BIT back to 24. Since sev_show_status() iterates the SEV_STATUS bits up to MSR_AMD64_SNP_RESV_BIT, also add the matching "IBPBOnEntry" entry to sev_status_feat_names[] so the array covers the restored range. Without it, sev_show_status() would index past the end of sev_status_feat_names[] (bits 18-23) and trigger an out-of-bounds read during boot status printing. Fixes: 7016fad256fa ("x86/sev: Dump SEV_STATUS") Signed-off-by: PVS.NarasimhaRao --- arch/x86/include/asm/msr-index.h | 4 +++- arch/x86/kernel/sev.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e7d4ec325bc9..b7a2480d2963 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -718,7 +718,9 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_IBPB_ON_ENTRY_BIT 23 +#define MSR_AMD64_SNP_IBPB_ON_ENTRY BIT_ULL(MSR_AMD64_SNP_IBPB_ON_ENTRY_BIT) +#define MSR_AMD64_SNP_RESV_BIT 24 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index b469b6b0706b..dcb13f3e43a6 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -79,6 +79,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", + [MSR_AMD64_SNP_IBPB_ON_ENTRY_BIT] = "IBPBOnEntry", }; /* For early boot hypervisor communication in SEV-ES enabled guests */ -- Gitee From 267fe901bacc7c5a03bfeafce01664eda3d51742 Mon Sep 17 00:00:00 2001 From: "PVS.NarasimhaRao" Date: Tue, 2 Jun 2026 12:42:09 +0800 Subject: [PATCH 02/75] anolis: crypto: ccp: Restore Hygon PSP mutex handling in SEV/SNP shutdown path ANBZ: #34453 Make sev_firmware_shutdown() take the shared Hygon PSP mutex when is_vendor_hygon() && psp_mutex_enabled so the shutdown path serializes against the other PSP command paths just like sev_do_cmd(), sev_platform_init() and the CSV ioctls, instead of relying only on sev_cmd_mutex. The panic shutdown notifier snp_shutdown_on_panic() must serialize the same way. It previously only checked sev_cmd_mutex before calling __sev_firmware_shutdown(), which on Hygon with psp_mutex_enabled does not detect an in-flight PSP command (those serialize on the PSP mailbox mutex, not sev_cmd_mutex) and could therefore race with one during a panic. Check the PSP mailbox mutex via psp_mutex_trylock() in that case and bail out if it cannot be acquired, matching the rest of the command paths. Fixes: 2c81734c0543 ("crypto: ccp: Add panic notifier for SEV/SNP firmware shutdown on kdump") Signed-off-by: PVS.NarasimhaRao --- drivers/crypto/ccp/sev-dev.c | 39 +++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index d0606b77184e..8e7024807dc6 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2397,9 +2397,22 @@ static void __sev_firmware_shutdown(struct sev_device *sev, bool panic) static void sev_firmware_shutdown(struct sev_device *sev) { - mutex_lock(&sev_cmd_mutex); + int mutex_enabled = READ_ONCE(hygon_psp_hooks.psp_mutex_enabled); + + if (is_vendor_hygon() && mutex_enabled) { + if (psp_mutex_lock_timeout(&hygon_psp_hooks.psp_misc->data_pg_aligned->mb_mutex, + PSP_MUTEX_TIMEOUT) != 1) + return; /* void return - bail out; cannot propagate -EBUSY */ + } else { + mutex_lock(&sev_cmd_mutex); + } + __sev_firmware_shutdown(sev, false); - mutex_unlock(&sev_cmd_mutex); + + if (is_vendor_hygon() && mutex_enabled) + psp_mutex_unlock(&hygon_psp_hooks.psp_misc->data_pg_aligned->mb_mutex); + else + mutex_unlock(&sev_cmd_mutex); } void sev_dev_destroy(struct psp_device *psp) @@ -2421,18 +2434,30 @@ static int snp_shutdown_on_panic(struct notifier_block *nb, unsigned long reason, void *arg) { struct sev_device *sev = psp_master->sev_data; + int mutex_enabled = READ_ONCE(hygon_psp_hooks.psp_mutex_enabled); /* - * If sev_cmd_mutex is already acquired, then it's likely - * another PSP command is in flight and issuing a shutdown - * would fail in unexpected ways. Rather than create even - * more confusion during a panic, just bail out here. + * If a PSP command is already in flight, then issuing a shutdown + * would fail in unexpected ways. Rather than create even more + * confusion during a panic, just bail out here. + * + * On Hygon with the PSP mailbox mutex enabled, in-flight commands + * serialize on that mutex rather than sev_cmd_mutex, so check it + * here to match sev_do_cmd()/sev_firmware_shutdown() and avoid + * racing with an in-flight command. */ - if (mutex_is_locked(&sev_cmd_mutex)) + if (is_vendor_hygon() && mutex_enabled) { + if (psp_mutex_trylock(&hygon_psp_hooks.psp_misc->data_pg_aligned->mb_mutex) != 1) + return NOTIFY_DONE; + } else if (mutex_is_locked(&sev_cmd_mutex)) { return NOTIFY_DONE; + } __sev_firmware_shutdown(sev, true); + if (is_vendor_hygon() && mutex_enabled) + psp_mutex_unlock(&hygon_psp_hooks.psp_misc->data_pg_aligned->mb_mutex); + return NOTIFY_DONE; } -- Gitee From b76e4799b4668eff614a30813fc9d8e3ed12c495 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 24 Apr 2024 10:57:57 -0500 Subject: [PATCH 03/75] x86/sev: Shorten struct name snp_secrets_page_layout to snp_secrets_page ANBZ: #34453 commit 1e52550729dafb41b12652a985d3df6cfa99cb88 upstream. Ending a struct name with "layout" is a little redundant, so shorten the snp_secrets_page_layout name to just snp_secrets_page. No functional change. [ bp: Rename the local pointer to "secrets" too for more clarity. ] Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/bc8d58302c6ab66c3beeab50cce3ec2c6bd72d6c.1713974291.git.thomas.lendacky@amd.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/sev.h | 2 +- arch/x86/kernel/sev.c | 6 +++--- drivers/virt/coco/sev-guest/sev-guest.c | 28 ++++++++++++------------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 4bec4cfb775a..25926271e4ce 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -139,7 +139,7 @@ struct secrets_os_area { #define VMPCK_KEY_LEN 32 /* See the SNP spec version 0.9 for secrets page format */ -struct snp_secrets_page_layout { +struct snp_secrets_page { u32 version; u32 imien : 1, rsvd1 : 31; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index dcb13f3e43a6..8ae582b984f2 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -650,7 +650,7 @@ static u64 __init get_secrets_page(void) static u64 __init get_snp_jump_table_addr(void) { - struct snp_secrets_page_layout *layout; + struct snp_secrets_page *secrets; void __iomem *mem; u64 pa, addr; @@ -664,9 +664,9 @@ static u64 __init get_snp_jump_table_addr(void) return 0; } - layout = (__force struct snp_secrets_page_layout *)mem; + secrets = (__force struct snp_secrets_page *)mem; - addr = layout->os_area.ap_jump_table_pa; + addr = secrets->os_area.ap_jump_table_pa; iounmap(mem); return addr; diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index bc564adcf499..8ab4be41a963 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -59,7 +59,7 @@ struct snp_guest_dev { */ struct snp_guest_msg secret_request, secret_response; - struct snp_secrets_page_layout *layout; + struct snp_secrets_page *secrets; struct snp_req_data input; union { struct snp_report_req report; @@ -743,26 +743,26 @@ static const struct file_operations snp_guest_fops = { .unlocked_ioctl = snp_guest_ioctl, }; -static u8 *get_vmpck(int id, struct snp_secrets_page_layout *layout, u32 **seqno) +static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno) { u8 *key = NULL; switch (id) { case 0: - *seqno = &layout->os_area.msg_seqno_0; - key = layout->vmpck0; + *seqno = &secrets->os_area.msg_seqno_0; + key = secrets->vmpck0; break; case 1: - *seqno = &layout->os_area.msg_seqno_1; - key = layout->vmpck1; + *seqno = &secrets->os_area.msg_seqno_1; + key = secrets->vmpck1; break; case 2: - *seqno = &layout->os_area.msg_seqno_2; - key = layout->vmpck2; + *seqno = &secrets->os_area.msg_seqno_2; + key = secrets->vmpck2; break; case 3: - *seqno = &layout->os_area.msg_seqno_3; - key = layout->vmpck3; + *seqno = &secrets->os_area.msg_seqno_3; + key = secrets->vmpck3; break; default: break; @@ -897,8 +897,8 @@ static void unregister_sev_tsm(void *data) static int __init sev_guest_probe(struct platform_device *pdev) { - struct snp_secrets_page_layout *layout; struct sev_guest_platform_data *data; + struct snp_secrets_page *secrets; struct device *dev = &pdev->dev; struct snp_guest_dev *snp_dev; struct miscdevice *misc; @@ -916,7 +916,7 @@ static int __init sev_guest_probe(struct platform_device *pdev) if (!mapping) return -ENODEV; - layout = (__force void *)mapping; + secrets = (__force void *)mapping; ret = -ENOMEM; snp_dev = devm_kzalloc(&pdev->dev, sizeof(struct snp_guest_dev), GFP_KERNEL); @@ -924,7 +924,7 @@ static int __init sev_guest_probe(struct platform_device *pdev) goto e_unmap; ret = -EINVAL; - snp_dev->vmpck = get_vmpck(vmpck_id, layout, &snp_dev->os_area_msg_seqno); + snp_dev->vmpck = get_vmpck(vmpck_id, secrets, &snp_dev->os_area_msg_seqno); if (!snp_dev->vmpck) { dev_err(dev, "invalid vmpck id %d\n", vmpck_id); goto e_unmap; @@ -938,7 +938,7 @@ static int __init sev_guest_probe(struct platform_device *pdev) platform_set_drvdata(pdev, snp_dev); snp_dev->dev = dev; - snp_dev->layout = layout; + snp_dev->secrets = secrets; /* Allocate the shared page used for the request and response message. */ snp_dev->request = alloc_shared_pages(dev, sizeof(struct snp_guest_msg)); -- Gitee From 447c178e3c811882ffbea015d041c3a9e6964cbb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 24 Apr 2024 10:57:58 -0500 Subject: [PATCH 04/75] x86/sev: Rename snp_init() in boot/compressed/sev.c ANBZ: #34453 commit 88ed43d32beb1ef3c06164c52b1c6ced47b5988b upstream. The snp_init() function in boot/compressed/sev.c is local to that file, is not called from outside of the file and is independent of the snp_init() function in kernel/sev.c. Change the name to better differentiate when each function is used. Move the renamed snp_init() and related functions up in the file to avoid having to add a forward declaration and make the function static. No functional change. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/afda29585c2724b9698003f24cefa77eb35f4ffb.1713974291.git.thomas.lendacky@amd.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/boot/compressed/sev.c | 162 ++++++++++++++++----------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 6affff6fe8e0..889b76c9b0b4 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -383,6 +383,85 @@ void snp_check_features(void) } } +/* Search for Confidential Computing blob in the EFI config table. */ +static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) +{ + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + int ret; + + ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len); + if (ret) + return NULL; + + return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa, + cfg_table_len, + EFI_CC_BLOB_GUID); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the boot kernel + * by firmware/bootloader in the following ways: + * + * - via an entry in the EFI config table + * - via a setup_data structure, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + cc_info = find_cc_blob_efi(bp); + if (cc_info) + goto found_cc_info; + + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + return cc_info; +} + +/* + * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks + * will verify the SNP CPUID/MSR bits. + */ +static bool early_snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + /* + * If a SNP-specific Confidential Computing blob is present, then + * firmware/bootloader have indicated SNP support. Verifying this + * involves CPUID checks which will be more reliable if the SNP + * CPUID table is used. See comments over snp_setup_cpuid_table() for + * more details. + */ + setup_cpuid_table(cc_info); + + /* + * Pass run-time kernel a pointer to CC info via boot_params so EFI + * config table doesn't need to be searched again during early startup + * phase. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + /* * sev_check_cpu_support - Check for SEV support in the CPU capabilities * @@ -433,7 +512,7 @@ void sev_enable(struct boot_params *bp) bp->cc_blob_address = 0; /* - * Do an initial SEV capability check before snp_init() which + * Do an initial SEV capability check before early_snp_init() which * loads the CPUID page and the same checks afterwards are done * without the hypervisor and are trustworthy. * @@ -448,7 +527,7 @@ void sev_enable(struct boot_params *bp) * Setup/preliminary detection of SNP. This will be sanity-checked * against CPUID/MSR values later. */ - snp = snp_init(bp); + snp = early_snp_init(bp); /* Now repeat the checks with the SNP CPUID table. */ @@ -505,85 +584,6 @@ u64 sev_get_status(void) return m.q; } -/* Search for Confidential Computing blob in the EFI config table. */ -static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) -{ - unsigned long cfg_table_pa; - unsigned int cfg_table_len; - int ret; - - ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len); - if (ret) - return NULL; - - return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa, - cfg_table_len, - EFI_CC_BLOB_GUID); -} - -/* - * Initial set up of SNP relies on information provided by the - * Confidential Computing blob, which can be passed to the boot kernel - * by firmware/bootloader in the following ways: - * - * - via an entry in the EFI config table - * - via a setup_data structure, as defined by the Linux Boot Protocol - * - * Scan for the blob in that order. - */ -static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - cc_info = find_cc_blob_efi(bp); - if (cc_info) - goto found_cc_info; - - cc_info = find_cc_blob_setup_data(bp); - if (!cc_info) - return NULL; - -found_cc_info: - if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); - - return cc_info; -} - -/* - * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks - * will verify the SNP CPUID/MSR bits. - */ -bool snp_init(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - if (!bp) - return false; - - cc_info = find_cc_blob(bp); - if (!cc_info) - return false; - - /* - * If a SNP-specific Confidential Computing blob is present, then - * firmware/bootloader have indicated SNP support. Verifying this - * involves CPUID checks which will be more reliable if the SNP - * CPUID table is used. See comments over snp_setup_cpuid_table() for - * more details. - */ - setup_cpuid_table(cc_info); - - /* - * Pass run-time kernel a pointer to CC info via boot_params so EFI - * config table doesn't need to be searched again during early startup - * phase. - */ - bp->cc_blob_address = (u32)(unsigned long)cc_info; - - return true; -} - void sev_prep_identity_maps(unsigned long top_level_pgt) { /* -- Gitee From 8b3f5ba8098ee9c855d4f3e3126801331a2f10c7 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 24 Apr 2024 10:57:59 -0500 Subject: [PATCH 05/75] x86/sev: Make the VMPL0 checking more straight forward ANBZ: #34453 commit e2f4c8c319abd1afbedb7a31877cb569265db1b4 upstream. Currently, the enforce_vmpl0() function uses a set argument when modifying the VMPL1 permissions used to test for VMPL0. If the guest is not running at VMPL0, the guest self-terminates. The function is just a wrapper for a fixed RMPADJUST function. Eliminate the function and perform the RMPADJUST directly. No functional change. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/ed01ddf04bfb475596b24b634fd26cffaa85173a.1713974291.git.thomas.lendacky@amd.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/boot/compressed/sev.c | 35 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 889b76c9b0b4..4201ebc3fa8f 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -304,26 +304,6 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } -static void enforce_vmpl0(void) -{ - u64 attrs; - int err; - - /* - * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically - * higher) privilege level. Here, clear the VMPL1 permission mask of the - * GHCB page. If the guest is not running at VMPL0, this will fail. - * - * If the guest is running at VMPL0, it will succeed. Even if that operation - * modifies permission bits, it is still ok to do so currently because Linux - * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks - * changing is a don't-care. - */ - attrs = 1; - if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); -} - /* * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need * guest side implementation for proper functioning of the guest. If any @@ -558,7 +538,20 @@ void sev_enable(struct boot_params *bp) if (!(get_hv_features() & GHCB_HV_FT_SNP)) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); - enforce_vmpl0(); + /* + * Enforce running at VMPL0. + * + * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically + * higher) privilege level. Here, clear the VMPL1 permission mask of the + * GHCB page. If the guest is not running at VMPL0, this will fail. + * + * If the guest is running at VMPL0, it will succeed. Even if that operation + * modifies permission bits, it is still ok to do so currently because Linux + * SNP guests running at VMPL0 only run at VMPL0, so VMPL1 or higher + * permission mask changes are a don't-care. + */ + if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); } if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) -- Gitee From 8979daae5f0d6887a9b9db9689bd242b816b2bf4 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 10 Oct 2023 17:52:19 +0300 Subject: [PATCH 06/75] x86/sev: Move sev_setup_arch() to mem_encrypt.c ANBZ: #34453 commit 6e74b125155dc8c747d76fb45d8e6d20e9e4fb4d upstream. Since commit: 4d96f9109109b ("x86/sev: Replace occurrences of sev_active() with cc_platform_has()") ... the SWIOTLB bounce buffer size adjustment and restricted virtio memory setting also inadvertently apply to TDX: the code is using cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) as a gatekeeping condition, which is also true for TDX, and this is also what we want. To reflect this, move the corresponding code to generic mem_encrypt.c. No functional changes intended. Signed-off-by: Alexander Shishkin Signed-off-by: Ingo Molnar Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20231010145220.3960055-2-alexander.shishkin@linux.intel.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/mem_encrypt.h | 4 ++-- arch/x86/kernel/setup.c | 2 +- arch/x86/mm/mem_encrypt.c | 34 +++++++++++++++++++++++++++++ arch/x86/mm/mem_encrypt_amd.c | 35 ------------------------------ 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 9816db501ea4..0291a1467441 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -20,8 +20,10 @@ struct boot_params; #ifdef CONFIG_X86_MEM_ENCRYPT void __init mem_encrypt_init(void); +void __init mem_encrypt_setup_arch(void); #else static inline void mem_encrypt_init(void) { } +static inline void __init mem_encrypt_setup_arch(void) { } #endif #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -44,7 +46,6 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sev_setup_arch(void); void sme_encrypt_kernel(struct boot_params *bp); void sme_enable(struct boot_params *bp); @@ -79,7 +80,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sev_setup_arch(void) { } static inline void sme_encrypt_kernel(struct boot_params *bp) { } static inline void sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e9d4292ac387..c9ae24996dc0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1171,7 +1171,7 @@ void __init setup_arch(char **cmdline_p) * Needs to run after memblock setup because it needs the physical * memory size. */ - sev_setup_arch(); + mem_encrypt_setup_arch(); cc_random_init(); efi_fake_memmap(); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 183c324cfbd0..a4ad430a0add 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -102,3 +103,36 @@ void __init mem_encrypt_init(void) print_mem_encrypt_feature_info(); } + +void __init mem_encrypt_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * For SEV and TDX, all DMA has to occur via shared/unencrypted pages. + * Kernel uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB using a percentage of guest + * memory for SWIOTLB buffers. Also, as the SWIOTLB bounce buffer + * memory is allocated from low memory, ensure that the adjusted size + * is within the limits of low available memory. + * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); + + /* Set restricted memory access for virtio. */ + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); +} diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 394d79998101..22f535bd2554 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -217,40 +216,6 @@ void __init sme_map_bootdata(char *real_mode_data) __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); } -void __init sev_setup_arch(void) -{ - phys_addr_t total_mem = memblock_phys_mem_size(); - unsigned long size; - - if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - return; - - /* - * For SEV, all DMA has to occur via shared/unencrypted pages. - * SEV uses SWIOTLB to make this happen without changing device - * drivers. However, depending on the workload being run, the - * default 64MB of SWIOTLB may not be enough and SWIOTLB may - * run out of buffers for DMA, resulting in I/O errors and/or - * performance degradation especially with high I/O workloads. - * - * Adjust the default size of SWIOTLB for SEV guests using - * a percentage of guest memory for SWIOTLB buffers. - * Also, as the SWIOTLB bounce buffer memory is allocated - * from low memory, ensure that the adjusted size is within - * the limits of low available memory. - * - * The percentage of guest memory used here for SWIOTLB buffers - * is more of an approximation of the static adjustment which - * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% - */ - size = total_mem * 6 / 100; - size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); - swiotlb_adjust_size(size); - - /* Set restricted memory access for virtio. */ - virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); -} - static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) { unsigned long pfn = 0; -- Gitee From 70790f02c538ef36eed2c8bc965a4048f97fce7b Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 22 Mar 2024 13:15:08 +0800 Subject: [PATCH 07/75] x86/kexec: Do not update E820 kexec table for setup_data ANBZ: #34453 commit fc7f27cda843ce294c71767d42b9d8abd015d7cb upstream. crashkernel reservation failed on a Thinkpad t440s laptop recently. Actually the memblock reservation succeeded, but later insert_resource() failed. Test steps: kexec load -> /* make sure add crashkernel param eg. crashkernel=160M */ kexec reboot -> dmesg|grep "crashkernel reserved"; crashkernel memory range like below reserved successfully: 0x00000000d0000000 - 0x00000000da000000 But no such "Crash kernel" region in /proc/iomem The background story: Currently the E820 code reserves setup_data regions for both the current kernel and the kexec kernel, and it inserts them into the resources list. Before the kexec kernel reboots nobody passes the old setup_data, and kexec only passes fresh SETUP_EFI/SETUP_IMA/SETUP_RNG_SEED if needed. Thus the old setup data memory is not used at all. Due to old kernel updates the kexec e820 table as well so kexec kernel sees them as E820_TYPE_RESERVED_KERN regions, and later the old setup_data regions are inserted into resources list in the kexec kernel by e820__reserve_resources(). Note, due to no setup_data is passed in for those old regions they are not early reserved (by function early_reserve_memory), and the crashkernel memblock reservation will just treat them as usable memory and it could reserve the crashkernel region which overlaps with the old setup_data regions. And just like the bug I noticed here, kdump insert_resource failed because e820__reserve_resources has added the overlapped chunks in /proc/iomem already. Finally, looking at the code, the old setup_data regions are not used at all as no setup_data is passed in by the kexec boot loader. Although something like SETUP_PCI etc could be needed, kexec should pass the info as new setup_data so that kexec kernel can take care of them. This should be taken care of in other separate patches if needed. Thus drop the useless buggy code here. Signed-off-by: Dave Young Signed-off-by: Ingo Molnar Cc: Jiri Bohac Cc: Eric DeVolder Cc: Baoquan He Cc: Ard Biesheuvel Cc: Kees Cook Cc: "Kirill A. Shutemov" Link: https://lore.kernel.org/r/Zf0T3HCG-790K-pZ@darkstar.users.ipa.redhat.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kernel/e820.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 77188777797c..94462c9a7871 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1015,17 +1015,6 @@ void __init e820__reserve_setup_data(void) e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - /* - * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by - * kexec and do not need to be reserved. - */ - if (data->type != SETUP_EFI && - data->type != SETUP_IMA && - data->type != SETUP_RNG_SEED) - e820__range_update_kexec(pa_data, - sizeof(*data) + data->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - if (data->type == SETUP_INDIRECT) { len += data->len; early_memunmap(data, sizeof(*data)); @@ -1037,12 +1026,9 @@ void __init e820__reserve_setup_data(void) indirect = (struct setup_indirect *)data->data; - if (indirect->type != SETUP_INDIRECT) { + if (indirect->type != SETUP_INDIRECT) e820__range_update(indirect->addr, indirect->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } } pa_data = pa_next; @@ -1050,7 +1036,6 @@ void __init e820__reserve_setup_data(void) } e820__update_table(e820_table); - e820__update_table(e820_table_kexec); pr_info("extended physical RAM map:\n"); e820__print_table("reserve setup_data"); -- Gitee From 20f87a9491439518dd3244954c0c36c34fb16ad4 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Fri, 26 Apr 2024 00:41:56 +0000 Subject: [PATCH 08/75] x86/e820: Add a new e820 table update helper ANBZ: #34453 commit d6d85ac15cce4dcf02cf8c96cb970562be6a3529 upstream. Add a new API helper e820__range_update_table() with which to update an arbitrary e820 table. Move all current users of e820__range_update_kexec() to this new helper. [ bp: Massage. ] Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/b726af213ad55053f8a7a1e793b01bb3f1ca9dd5.1714090302.git.ashish.kalra@amd.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/e820/api.h | 1 + arch/x86/kernel/e820.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index e8f58ddd06d9..2e74a7f0e935 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -17,6 +17,7 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); extern void e820__range_add (u64 start, u64 size, enum e820_type type); extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); +extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern void e820__print_table(char *who); extern int e820__update_table(struct e820_table *table); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 94462c9a7871..658c8dc8aa71 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -532,9 +532,10 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, + enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); + return __e820__range_update(t, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -805,7 +806,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = memblock_phys_alloc(size, align); if (addr) { - e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); e820__update_table_kexec(); } -- Gitee From 409736f0d1c54dae6a7b1c9b8d0f13137aab72ff Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Fri, 26 Apr 2024 00:43:18 +0000 Subject: [PATCH 09/75] x86/sev: Add callback to apply RMP table fixups for kexec ANBZ: #34453 commit 400fea4b9651adf5d7ebd5d71e905f34f4e4e493 upstream. Handle cases where the RMP table placement in the BIOS is not 2M aligned and the kexec-ed kernel could try to allocate from within that chunk which then causes a fatal RMP fault. The kexec failure is illustrated below: SEV-SNP: RMP table physical range [0x0000007ffe800000 - 0x000000807f0fffff] BIOS-provided physical RAM map: BIOS-e820: [mem 0x0000000000000000-0x000000000008efff] usable BIOS-e820: [mem 0x000000000008f000-0x000000000008ffff] ACPI NVS ... BIOS-e820: [mem 0x0000004080000000-0x0000007ffe7fffff] usable BIOS-e820: [mem 0x0000007ffe800000-0x000000807f0fffff] reserved BIOS-e820: [mem 0x000000807f100000-0x000000807f1fefff] usable As seen here in the e820 memory map, the end range of the RMP table is not aligned to 2MB and not reserved but it is usable as RAM. Subsequently, kexec -s (KEXEC_FILE_LOAD syscall) loads it's purgatory code and boot_param, command line and other setup data into this RAM region as seen in the kexec logs below, which leads to fatal RMP fault during kexec boot. Loaded purgatory at 0x807f1fa000 Loaded boot_param, command line and misc at 0x807f1f8000 bufsz=0x1350 memsz=0x2000 Loaded 64bit kernel at 0x7ffae00000 bufsz=0xd06200 memsz=0x3894000 Loaded initrd at 0x7ff6c89000 bufsz=0x4176014 memsz=0x4176014 E820 memmap: 0000000000000000-000000000008efff (1) 000000000008f000-000000000008ffff (4) 0000000000090000-000000000009ffff (1) ... 0000004080000000-0000007ffe7fffff (1) 0000007ffe800000-000000807f0fffff (2) 000000807f100000-000000807f1fefff (1) 000000807f1ff000-000000807fffffff (2) nr_segments = 4 segment[0]: buf=0x00000000e626d1a2 bufsz=0x4000 mem=0x807f1fa000 memsz=0x5000 segment[1]: buf=0x0000000029c67bd6 bufsz=0x1350 mem=0x807f1f8000 memsz=0x2000 segment[2]: buf=0x0000000045c60183 bufsz=0xd06200 mem=0x7ffae00000 memsz=0x3894000 segment[3]: buf=0x000000006e54f08d bufsz=0x4176014 mem=0x7ff6c89000 memsz=0x4177000 kexec_file_load: type:0, start:0x807f1fa150 head:0x1184d0002 flags:0x0 Check if RMP table start and end physical range in the e820 tables are not aligned to 2MB and in that case map this range to reserved in all the three e820 tables. [ bp: Massage. ] Fixes: c3b86e61b756 ("x86/cpufeatures: Enable/unmask SEV-SNP CPU feature") Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/df6e995ff88565262c2c7c69964883ff8aa6fc30.1714090302.git.ashish.kalra@amd.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/sev.h | 2 ++ arch/x86/mm/mem_encrypt.c | 7 +++++++ arch/x86/virt/svm/sev.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 25926271e4ce..d33c1f1b9e75 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -268,6 +268,7 @@ int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immut int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); void kdump_sev_callback(void); +void snp_fixup_e820_tables(void); #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } @@ -281,6 +282,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline void kdump_sev_callback(void) { } +static inline void snp_fixup_e820_tables(void) {} #endif #endif diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index a4ad430a0add..064cd8bbdbdd 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -109,6 +109,13 @@ void __init mem_encrypt_setup_arch(void) phys_addr_t total_mem = memblock_phys_mem_size(); unsigned long size; + /* + * Do RMP table fixups after the e820 tables have been setup by + * e820__memory_setup(). + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + snp_fixup_e820_tables(); + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index ab0e8448bb6e..0ae10535c699 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -163,6 +163,42 @@ bool snp_probe_rmptable_info(void) return true; } +static void __init __snp_fixup_e820_tables(u64 pa) +{ + if (IS_ALIGNED(pa, PMD_SIZE)) + return; + + /* + * Handle cases where the RMP table placement by the BIOS is not + * 2M aligned and the kexec kernel could try to allocate + * from within that chunk which then causes a fatal RMP fault. + * + * The e820_table needs to be updated as it is converted to + * kernel memory resources and used by KEXEC_FILE_LOAD syscall + * to load kexec segments. + * + * The e820_table_firmware needs to be updated as it is exposed + * to sysfs and used by the KEXEC_LOAD syscall to load kexec + * segments. + * + * The e820_table_kexec needs to be updated as it passed to + * the kexec-ed kernel. + */ + pa = ALIGN_DOWN(pa, PMD_SIZE); + if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) { + pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); + e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + } +} + +void __init snp_fixup_e820_tables(void) +{ + __snp_fixup_e820_tables(probed_rmp_base); + __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); +} + /* * Do the necessary preparations which are verified by the firmware as * described in the SNP_INIT_EX firmware command description in the SNP -- Gitee From 926d0fbc15d25a2add94763f3bf2b7ae3b2a3e6c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Feb 2024 18:41:33 -0800 Subject: [PATCH 10/75] KVM: x86: Remove separate "bit" defines for page fault error code masks ANBZ: #34453 commit 63b6206e2f9a4ca756262a8bc20fb869d6db52be upstream. Open code the bit number directly in the PFERR_* masks and drop the intermediate PFERR_*_BIT defines, as having to bounce through two macros just to see which flag corresponds to which bit is quite annoying, as is having to define two macros just to add recognition of a new flag. Use ternary operator to derive the bit in permission_fault(), the one function that actually needs the bit number as part of clever shifting to avoid conditional branches. Generally the compiler is able to turn it into a conditional move, and if not it's not really a big deal. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-ID: <20240228024147.41573-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 32 ++++++++++---------------------- arch/x86/kvm/mmu.h | 5 ++--- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8fb2e23ff1a0..21239398163a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -246,28 +246,16 @@ enum x86_intercept_stage; KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) - -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 -#define PFERR_PK_BIT 5 -#define PFERR_SGX_BIT 15 -#define PFERR_GUEST_FINAL_BIT 32 -#define PFERR_GUEST_PAGE_BIT 33 -#define PFERR_IMPLICIT_ACCESS_BIT 48 - -#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) -#define PFERR_USER_MASK BIT(PFERR_USER_BIT) -#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) -#define PFERR_PK_MASK BIT(PFERR_PK_BIT) -#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) +#define PFERR_PRESENT_MASK BIT(0) +#define PFERR_WRITE_MASK BIT(1) +#define PFERR_USER_MASK BIT(2) +#define PFERR_RSVD_MASK BIT(3) +#define PFERR_FETCH_MASK BIT(4) +#define PFERR_PK_MASK BIT(5) +#define PFERR_SGX_MASK BIT(15) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(32) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(48) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ade33a54306d..62ef27314c13 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -215,7 +215,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; - int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; + int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1; u32 errcode = PFERR_PRESENT_MASK; bool fault; @@ -236,8 +236,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ - offset = (pfec & ~1) + - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); + offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0); pkru_bits &= mmu->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK; -- Gitee From 13d5aa29418c054077bfbac3407305a1f2dd9c75 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Feb 2024 18:41:34 -0800 Subject: [PATCH 11/75] KVM: x86: Define more SEV+ page fault error bits/flags for #NPF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #34453 commit 9b62e03e192ce9300608f9be69be9854a166eae3 upstream. Define more #NPF error code flags that are relevant to SEV+ (mostly SNP) guests, as specified by the APM: * Bit 31 (RMP): Set to 1 if the fault was caused due to an RMP check or a VMPL check failure, 0 otherwise. * Bit 34 (ENC): Set to 1 if the guest’s effective C-bit was 1, 0 otherwise. * Bit 35 (SIZEM): Set to 1 if the fault was caused by a size mismatch between PVALIDATE or RMPADJUST and the RMP, 0 otherwise. * Bit 36 (VMPL): Set to 1 if the fault was caused by a VMPL permission check failure, 0 otherwise. Note, the APM is *extremely* misleading, and strongly implies that the above flags can _only_ be set for #NPF exits from SNP guests. That is a lie, as bit 34 (C-bit=1, i.e. was encrypted) can be set when running _any_ flavor of SEV guest on SNP capable hardware. Signed-off-by: Sean Christopherson Message-ID: <20240228024147.41573-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 21239398163a..ff67bf5800b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -253,8 +253,12 @@ enum x86_intercept_stage; #define PFERR_FETCH_MASK BIT(4) #define PFERR_PK_MASK BIT(5) #define PFERR_SGX_MASK BIT(15) +#define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) #define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_GUEST_ENC_MASK BIT_ULL(34) +#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) +#define PFERR_GUEST_VMPL_MASK BIT_ULL(36) #define PFERR_IMPLICIT_ACCESS BIT_ULL(48) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ -- Gitee From b3a7371278a95661b1487d7e855077522b2ce26f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2024 07:30:29 -0400 Subject: [PATCH 12/75] KVM: x86: Move synthetic PFERR_* sanity checks to SVM's #NPF handler ANBZ: #34453 commit dee281e4b4355286c76d1788dc8e65ec236d6e04 upstream. Move the sanity check that hardware never sets bits that collide with KVM- define synthetic bits from kvm_mmu_page_fault() to npf_interception(), i.e. make the sanity check #NPF specific. The legacy #PF path already WARNs if _any_ of bits 63:32 are set, and the error code that comes from VMX's EPT Violatation and Misconfig is 100% synthesized (KVM morphs VMX's EXIT_QUALIFICATION into error code flags). Add a compile-time assert in the legacy #PF handler to make sure that KVM- define flags are covered by its existing sanity check on the upper bits. Opportunistically add a description of PFERR_IMPLICIT_ACCESS, since we are removing the comment that defined it. Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Reviewed-by: Binbin Wu Message-ID: <20240228024147.41573-8-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/mmu/mmu.c | 14 +++----------- arch/x86/kvm/svm/svm.c | 9 +++++++++ 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ff67bf5800b3..85bac04acb7c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -259,7 +259,13 @@ enum x86_intercept_stage; #define PFERR_GUEST_ENC_MASK BIT_ULL(34) #define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) #define PFERR_GUEST_VMPL_MASK BIT_ULL(36) + +/* + * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks + * when emulating instructions that triggers implicit access. + */ #define PFERR_IMPLICIT_ACCESS BIT_ULL(48) +#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index de555993713f..4bda74a55c9c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4509,6 +4509,9 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, return -EFAULT; #endif + /* Ensure the above sanity check also covers KVM-defined flags. */ + BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); + vcpu->arch.l1tf_flush_l1d = true; if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); @@ -5794,17 +5797,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; - /* - * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP - * checks when emulating instructions that triggers implicit access. - * WARN if hardware generates a fault with an error code that collides - * with the KVM-defined value. Clear the flag and continue on, i.e. - * don't terminate the VM, as KVM can't possibly be relying on a flag - * that KVM doesn't know about. - */ - if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) - error_code &= ~PFERR_IMPLICIT_ACCESS; - if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1f217791a0d5..da317d46049b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2100,6 +2100,15 @@ static int npf_interception(struct kvm_vcpu *vcpu) u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; + /* + * WARN if hardware generates a fault with an error code that collides + * with KVM-defined sythentic flags. Clear the flags and continue on, + * i.e. don't terminate the VM, as KVM can't possibly be relying on a + * flag that KVM doesn't know about. + */ + if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) + error_code &= ~PFERR_SYNTHETIC_MASK; + trace_kvm_page_fault(vcpu, fault_address, error_code); return kvm_mmu_page_fault(vcpu, fault_address, error_code, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? -- Gitee From 79b7e4a9e1054e2c5b1774ffb5e7e50dab7ef1d9 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 27 Feb 2024 18:41:35 -0800 Subject: [PATCH 13/75] KVM: x86/mmu: Pass full 64-bit error code when handling page faults ANBZ: #34453 commit c9710130ccae3ac3798a5731d2291eeac3a15e20 upstream. Plumb the full 64-bit error code throughout the page fault handling code so that KVM can use the upper 32 bits, e.g. SNP's PFERR_GUEST_ENC_MASK will be used to determine whether or not a fault is private vs. shared. Note, passing the 64-bit error code to FNAME(walk_addr)() does NOT change the behavior of permission_fault() when invoked in the page fault path, as KVM explicitly clears PFERR_IMPLICIT_ACCESS in kvm_mmu_page_fault(). Continue passing '0' from the async #PF worker, as guest_memfd and thus private memory doesn't support async page faults. Signed-off-by: Isaku Yamahata [mdr: drop references/changes on rebase, update commit message] Signed-off-by: Michael Roth [sean: drop truncation in call to FNAME(walk_addr)(), rewrite changelog] Signed-off-by: Sean Christopherson Reviewed-by: Xiaoyao Li Message-ID: <20240228024147.41573-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/kvm/mmu/mmu.c | 3 +-- arch/x86/kvm/mmu/mmu_internal.h | 4 ++-- arch/x86/kvm/mmu/mmutrace.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4bda74a55c9c..9ef5938bb131 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5808,8 +5808,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err } if (r == RET_PF_INVALID) { - r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), false, + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 0669a8a668ca..21f55e8b4dc6 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm) struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; - const u32 error_code; + const u64 error_code; const bool prefetch; /* Derived from error_code. */ @@ -280,7 +280,7 @@ enum { }; static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, int *emulation_type) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef69..195d98bc8de8 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -260,7 +260,7 @@ TRACE_EVENT( TP_STRUCT__entry( __field(int, vcpu_id) __field(gpa_t, cr2_or_gpa) - __field(u32, error_code) + __field(u64, error_code) __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) -- Gitee From a16b3eb4fe40e86e52e055f0ee01c64a3e65843d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Feb 2024 09:28:08 -0500 Subject: [PATCH 14/75] KVM: x86/mmu: Use synthetic page fault error code to indicate private faults ANBZ: #34453 commit b3d5dc629c32f03d6ae0ddff628a67d999b723e0 upstream. Add and use a synthetic, KVM-defined page fault error code to indicate whether a fault is to private vs. shared memory. TDX and SNP have different mechanisms for reporting private vs. shared, and KVM's software-protected VMs have no mechanism at all. Usurp an error code flag to avoid having to plumb another parameter to kvm_mmu_page_fault() and friends. Alternatively, KVM could borrow AMD's PFERR_GUEST_ENC_MASK, i.e. set it for TDX and software-protected VMs as appropriate, but that would require *clearing* the flag for SEV and SEV-ES VMs, which support encrypted memory at the hardware layer, but don't utilize private memory at the KVM layer. Opportunistically add a comment to call out that the logic for software- protected VMs is (and was before this commit) broken for nested MMUs, i.e. for nested TDP, as the GPA is an L2 GPA. Punt on trying to play nice with nested MMUs as there is a _lot_ of functionality that simply doesn't work for software-protected VMs, e.g. all of the paths where KVM accesses guest memory need to be updated to be aware of private vs. shared memory. Signed-off-by: Sean Christopherson Message-Id: <20240228024147.41573-6-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 7 ++++++- arch/x86/kvm/mmu/mmu.c | 14 ++++++++++++++ arch/x86/kvm/mmu/mmu_internal.h | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 85bac04acb7c..6981012270a1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -265,7 +265,12 @@ enum x86_intercept_stage; * when emulating instructions that triggers implicit access. */ #define PFERR_IMPLICIT_ACCESS BIT_ULL(48) -#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS) +/* + * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred + * when the guest was accessing private memory. + */ +#define PFERR_PRIVATE_ACCESS BIT_ULL(49) +#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9ef5938bb131..72fb1bfd622f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5800,6 +5800,20 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; + /* + * Except for reserved faults (emulated MMIO is shared-only), set the + * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's + * current attributes, which are the source of truth for such VMs. Note, + * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't + * currently supported nested virtualization (among many other things) + * for software-protected VMs. + */ + if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && + !(error_code & PFERR_RSVD_MASK) && + vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 21f55e8b4dc6..592315bfdf5d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -298,7 +298,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, - .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), + .is_private = err & PFERR_PRIVATE_ACCESS, }; int r; -- Gitee From 68e14bc57397d5506fa9649809396256bd1305fa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 27 Feb 2024 09:11:28 -0500 Subject: [PATCH 15/75] KVM: x86/mmu: check for invalid async page faults involving private memory ANBZ: #34453 commit cd389f50700343774ae6b25b08e16247b3c7fa4c upstream. Right now the error code is not used when an async page fault is completed. This is not a problem in the current code, but it is untidy. For protected VMs, we will also need to check that the page attributes match the current state of the page, because asynchronous page faults can only occur on shared pages (private pages go through kvm_faultin_pfn_private() instead of __gfn_to_pfn_memslot()). Start by piping the error code from kvm_arch_setup_async_pf() to kvm_arch_async_page_ready() via the architecture-specific async page fault data. For now, it can be used to assert that there are no async page faults on private memory. Extracted from a patch by Isaku Yamahata. Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6981012270a1..41ddace7b521 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1863,6 +1863,7 @@ struct kvm_arch_async_pf { gfn_t gfn; unsigned long cr3; bool direct_map; + u64 error_code; }; extern u32 __read_mostly kvm_nr_uret_msrs; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 72fb1bfd622f..2f758f440448 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4245,24 +4245,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu) return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); - arch.gfn = gfn; + arch.gfn = fault->gfn; + arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); - return kvm_setup_async_pf(vcpu, cr2_or_gpa, - kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + return kvm_setup_async_pf(vcpu, fault->addr, + kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; + if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) + return; + if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; @@ -4275,7 +4279,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); } static inline u8 kvm_max_level_for_order(int order) @@ -4380,7 +4384,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; - } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { + } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } -- Gitee From 6fb72af1f8dce110644827d9ffb146e1e3a1c0d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Apr 2024 09:53:26 -0700 Subject: [PATCH 16/75] KVM: x86: Fully re-initialize supported_vm_types on vendor module load ANBZ: #34453 commit c43ad19045d5b7bd3aaf20d9b1f5acb22bdd6a38 upstream. Recompute the entire set of supported VM types when a vendor module is loaded, as preserving supported_vm_types across vendor module unload and reload can result in VM types being incorrectly treated as supported. E.g. if a vendor module is loaded with TDP enabled, unloaded, and then reloaded with TDP disabled, KVM_X86_SW_PROTECTED_VM will be incorrectly retained. Ditto for SEV_VM and SEV_ES_VM and their respective module params in kvm-amd.ko. Fixes: 2a955c4db1dd ("KVM: x86: Add supported_vm_types to kvm_caps") Signed-off-by: Sean Christopherson Reviewed-by: Xiaoyao Li Message-ID: <20240423165328.2853870-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dc8ed6d22374..07ccc0a9e056 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -96,7 +96,6 @@ struct kvm_caps kvm_caps __read_mostly = { .supported_mce_cap = MCG_CTL_P | MCG_SER_P, - .supported_vm_types = BIT(KVM_X86_DEFAULT_VM), }; EXPORT_SYMBOL_GPL(kvm_caps); @@ -9710,6 +9709,8 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (r) goto out_free_percpu; + kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); + if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; -- Gitee From d14a57d94dea539d45e1f3824551696fd68e6b90 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 1 May 2024 02:10:45 -0500 Subject: [PATCH 17/75] KVM: SEV: Add support to handle AP reset MSR protocol ANBZ: #34453 commit d916f00316b206255164392eeb2aca5f87cdb18a upstream. Add support for AP Reset Hold being invoked using the GHCB MSR protocol, available in version 2 of the GHCB specification. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth Message-ID: <20240501071048.2208265-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv --- arch/x86/include/asm/sev-common.h | 6 ++-- arch/x86/kvm/svm/sev.c | 56 ++++++++++++++++++++++++++----- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index b463fcbd4b90..01261f7054ad 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -54,8 +54,10 @@ (((unsigned long)fn) << 32)) /* AP Reset Hold */ -#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 -#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 +#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 +#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) /* GHCB GPA Register */ #define GHCB_MSR_REG_GPA_REQ 0x012 diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7f21792f958e..406e38ab0dda 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -52,6 +52,10 @@ static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); static u64 sev_supported_vmsa_features; +#define AP_RESET_HOLD_NONE 0 +#define AP_RESET_HOLD_NAE_EVENT 1 +#define AP_RESET_HOLD_MSR_PROTO 2 + static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); @@ -2924,6 +2928,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) void sev_es_unmap_ghcb(struct vcpu_svm *svm) { + /* Clear any indication that the vCPU is in a type of AP Reset Hold */ + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; + if (!svm->sev_es.ghcb) return; @@ -3142,6 +3149,22 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_AP_RESET_HOLD_REQ: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + + /* + * Preset the result to a non-SIPI return and then only set + * the result to non-zero when delivering a SIPI. + */ + set_ghcb_msr_bits(svm, 0, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -3241,6 +3264,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { @@ -3479,15 +3503,31 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) return; } - /* - * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where - * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a - * non-zero value. - */ - if (!svm->sev_es.ghcb) - return; + /* Subsequent SIPI */ + switch (svm->sev_es.ap_reset_hold_type) { + case AP_RESET_HOLD_NAE_EVENT: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. + */ + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + break; + case AP_RESET_HOLD_MSR_PROTO: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set GHCB data field to a non-zero value. + */ + set_ghcb_msr_bits(svm, 1, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + default: + break; + } } struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index fb54999d2cec..470123aac8ff 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -200,6 +200,7 @@ struct vcpu_sev_es_state { u8 valid_bitmap[16]; struct kvm_host_map ghcb_map; bool received_first_sipi; + unsigned int ap_reset_hold_type; /* SEV-ES scratch area support */ u64 sw_scratch; -- Gitee From 30d8a56ed2eff2afac973aa6dcc143835caa67a0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 02:10:46 -0500 Subject: [PATCH 18/75] KVM: SEV: Add GHCB handling for Hypervisor Feature Support requests ANBZ: #34453 commit ae01818398236ad6d8ecd6970334baf0b7c57409 upstream. Version 2 of the GHCB specification introduced advertisement of features that are supported by the Hypervisor. Now that KVM supports version 2 of the GHCB specification, bump the maximum supported protocol version. Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth Message-ID: <20240501071048.2208265-3-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/include/asm/sev-common.h | 2 ++ arch/x86/kvm/svm/sev.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 01261f7054ad..5a8246dd532f 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -101,6 +101,8 @@ enum psc_op { /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_POS 12 +#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0) #define GHCB_MSR_HV_FT_RESP_VAL(v) \ /* GHCBData[63:12] */ \ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 406e38ab0dda..a87b14691744 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -39,6 +39,8 @@ #define GHCB_VERSION_MAX 1ULL #define GHCB_VERSION_MIN 1ULL +#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP + /* enable/disable SEV support */ static bool sev_enabled = true; module_param_named(sev, sev_enabled, bool, 0444); @@ -2898,6 +2900,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: + case SVM_VMGEXIT_HV_FEATURES: break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -3165,6 +3168,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); break; + case GHCB_MSR_HV_FT_REQ: + set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, + GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, + GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -3289,6 +3298,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; } + case SVM_VMGEXIT_HV_FEATURES: + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED); + + ret = 1; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", -- Gitee From 714307e9400de3ce70b991c377d8ff654cd34296 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 02:10:47 -0500 Subject: [PATCH 19/75] KVM: SEV: Add GHCB handling for termination requests ANBZ: #34453 commit 8d1a36e42be6b0864c2c30f94536663b6f08fb48 upstream. GHCB version 2 adds support for a GHCB-based termination request that a guest can issue when it reaches an error state and wishes to inform the hypervisor that it should be terminated. Implement support for that similarly to GHCB MSR-based termination requests that are already available to SEV-ES guests via earlier versions of the GHCB protocol. See 'Termination Request' in the 'Invoking VMGEXIT' section of the GHCB specification for more details. Signed-off-by: Michael Roth Message-ID: <20240501071048.2208265-4-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a87b14691744..3ba67ee2e797 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2901,6 +2901,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: case SVM_VMGEXIT_HV_FEATURES: + case SVM_VMGEXIT_TERM_REQUEST: break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -3303,6 +3304,14 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; + case SVM_VMGEXIT_TERM_REQUEST: + pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", + control->exit_info_1, control->exit_info_2); + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", -- Gitee From 6e4bd6848309f8a9f0b1055d0a6840134dc038c1 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 02:10:48 -0500 Subject: [PATCH 20/75] KVM: SEV: Allow per-guest configuration of GHCB protocol version ANBZ: #34453 commit 4af663c2f64a8d252e690c60cf8b8abf22dc2951 upstream. The GHCB protocol version may be different from one guest to the next. Add a field to track it for each KVM instance and extend KVM_SEV_INIT2 to allow it to be configured by userspace. Now that all SEV-ES support for GHCB protocol version 2 is in place, go ahead and default to it when creating SEV-ES guests through the new KVM_SEV_INIT2 interface. Keep the older KVM_SEV_ES_INIT interface restricted to GHCB protocol version 1. Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Message-ID: <20240501071048.2208265-5-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- .../virt/kvm/x86/amd-memory-encryption.rst | 11 +++++-- arch/x86/include/uapi/asm/kvm.h | 4 ++- arch/x86/kvm/svm/sev.c | 32 +++++++++++++++++-- arch/x86/kvm/svm/svm.h | 1 + 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 3381556d596d..9677a0714a39 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -95,13 +95,19 @@ Returns: 0 on success, -negative on error struct kvm_sev_init { __u64 vmsa_features; /* initial value of features field in VMSA */ __u32 flags; /* must be 0 */ - __u32 pad[9]; + __u16 ghcb_version; /* maximum guest GHCB version allowed */ + __u16 pad1; + __u32 pad2[8]; }; It is an error if the hypervisor does not support any of the bits that are set in ``flags`` or ``vmsa_features``. ``vmsa_features`` must be 0 for SEV virtual machines, as they do not have a VMSA. +``ghcb_version`` must be 0 for SEV virtual machines, as they do not issue GHCB +requests. If ``ghcb_version`` is 0 for any other guest type, then the maximum +allowed guest GHCB protocol will default to version 2. + This command replaces the deprecated KVM_SEV_INIT and KVM_SEV_ES_INIT commands. The commands did not have any parameters (the ```data``` field was unused) and only work for the KVM_X86_DEFAULT_VM machine type (0). @@ -112,7 +118,8 @@ They behave as if: KVM_SEV_ES_INIT * the ``flags`` and ``vmsa_features`` fields of ``struct kvm_sev_init`` are - set to zero + set to zero, and ``ghcb_version`` is set to 0 for KVM_SEV_INIT and 1 for + KVM_SEV_ES_INIT. If the ``KVM_X86_SEV_VMSA_FEATURES`` attribute does not exist, the hypervisor only supports KVM_SEV_INIT and KVM_SEV_ES_INIT. In that case, note that KVM_SEV_ES_INIT diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index e85d59dd6e1e..74248345fd9d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -704,7 +704,9 @@ struct kvm_sev_cmd { struct kvm_sev_init { __u64 vmsa_features; __u32 flags; - __u32 pad[9]; + __u16 ghcb_version; + __u16 pad1; + __u32 pad2[8]; }; struct kvm_sev_launch_start { diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 3ba67ee2e797..813de9a74fe5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -36,7 +36,8 @@ #include "csv.h" -#define GHCB_VERSION_MAX 1ULL +#define GHCB_VERSION_MAX 2ULL +#define GHCB_VERSION_DEFAULT 2ULL #define GHCB_VERSION_MIN 1ULL #define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP @@ -338,12 +339,24 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (data->vmsa_features & ~valid_vmsa_features) return -EINVAL; + if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) + return -EINVAL; + if (unlikely(sev->active)) return ret; sev->active = true; sev->es_active = es_active; sev->vmsa_features = data->vmsa_features; + sev->ghcb_version = data->ghcb_version; + + /* + * Currently KVM supports the full range of mandatory features defined + * by version 2 of the GHCB protocol, so default to that for SEV-ES + * guests created via KVM_SEV_INIT2. + */ + if (sev->es_active && !sev->ghcb_version) + sev->ghcb_version = GHCB_VERSION_DEFAULT; #ifdef CONFIG_KVM_SUPPORTS_CSV_REUSE_ASID @@ -417,6 +430,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_init data = { .vmsa_features = 0, + .ghcb_version = 0, }; unsigned long vm_type; @@ -424,6 +438,14 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); + + /* + * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will + * continue to only ever support the minimal GHCB protocol version. + */ + if (vm_type == KVM_X86_SEV_ES_VM) + data.ghcb_version = GHCB_VERSION_MIN; + return __sev_guest_init(kvm, argp, &data, vm_type); } @@ -3101,6 +3123,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; u64 ghcb_info; int ret = 1; @@ -3111,7 +3134,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) switch (ghcb_info) { case GHCB_MSR_SEV_INFO_REQ: - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); break; @@ -3467,11 +3490,14 @@ void sev_init_vmcb(struct vcpu_svm *svm) void sev_es_vcpu_reset(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + /* * Set the GHCB MSR value as per the GHCB specification when emulating * vCPU RESET for an SEV-ES guest. */ - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 470123aac8ff..56f7df543586 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -90,6 +90,7 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ u64 vmsa_features; + u16 ghcb_version; /* Highest guest GHCB protocol version allowed */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ struct list_head mirror_vms; /* List of VMs mirroring */ struct list_head mirror_entry; /* Use as a list entry of mirrors */ -- Gitee From 1c79566ee0fe40a00f03d0993f7d467be0a2f0b6 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 29 Mar 2024 16:24:42 -0500 Subject: [PATCH 21/75] mm: Introduce AS_INACCESSIBLE for encrypted/confidential memory ANBZ: #34453 commit c72ceafbd12cf95e088681ae5e535ef1a78bf0ed upstream. filemap users like guest_memfd may use page cache pages to allocate/manage memory that is only intended to be accessed by guests via hardware protections like encryption. Writes to memory of this sort in common paths like truncation may cause unexpected behavior such as writing garbage instead of zeros when attempting to zero pages, or worse, triggering hardware protections that are considered fatal as far as the kernel is concerned. Introduce a new address_space flag, AS_INACCESSIBLE, and use this initially to prevent zero'ing of pages during truncation, with the understanding that it is up to the owner of the mapping to handle this specially if needed. This is admittedly a rather blunt solution, but it seems like there are no other places that should take into account the flag to keep its promise. Link: https://lore.kernel.org/lkml/ZR9LYhpxTaTk6PJX@google.com/ Cc: Matthew Wilcox Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Message-ID: <20240329212444.395559-5-michael.roth@amd.com> Acked-by: Vlastimil Babka Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- include/linux/pagemap.h | 1 + mm/truncate.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 322defe67e05..3ca97366bfc6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -216,6 +216,7 @@ enum mapping_flags { folio contents */ AS_UNMOVABLE = 8, /* The mapping cannot be moved, ever */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, + AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping */ AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, diff --git a/mm/truncate.c b/mm/truncate.c index cfa88a3c1042..3f2451649fa0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -259,7 +259,8 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) * doing a complex calculation here, and then doing the zeroing * anyway if the page split fails. */ - folio_zero_range(folio, offset, length); + if (!(folio->mapping->flags & AS_INACCESSIBLE)) + folio_zero_range(folio, offset, length); if (folio_needs_release(folio)) folio_invalidate(folio, offset, length); -- Gitee From d321ef448d97d85fdba0a71306974e9e07014f9b Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 29 Mar 2024 16:24:43 -0500 Subject: [PATCH 22/75] KVM: guest_memfd: Use AS_INACCESSIBLE when creating guest_memfd inode ANBZ: #34453 commit 1d23040caa8bef867572ae814b5f8b5fa44eccd3 upstream. truncate_inode_pages_range() may attempt to zero pages before truncating them, and this will occur before arch-specific invalidations can be triggered via .invalidate_folio/.free_folio hooks via kvm_gmem_aops. For AMD SEV-SNP this would result in an RMP #PF being generated by the hardware, which is currently treated as fatal (and even if specifically allowed for, would not result in anything other than garbage being written to guest pages due to encryption). On Intel TDX this would also result in undesirable behavior. Set the AS_INACCESSIBLE flag to prevent the MM from attempting unexpected accesses of this sort during operations like truncation. This may also in some cases yield a decent performance improvement for guest_memfd userspace implementations that hole-punch ranges immediately after private->shared conversions via KVM_SET_MEMORY_ATTRIBUTES, since the current implementation of truncate_inode_pages_range() always ends up zero'ing an entire 4K range if it is backing by a 2M folio. Link: https://lore.kernel.org/lkml/ZR9LYhpxTaTk6PJX@google.com/ Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Message-ID: <20240329212444.395559-6-michael.roth@amd.com> Acked-by: Vlastimil Babka Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fda1c9482b99..e88042651f1f 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -361,6 +361,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; + inode->i_mapping->flags |= AS_INACCESSIBLE; inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); -- Gitee From 2d9377fc9c984fe20953a9a24e4077500315a279 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Feb 2024 11:31:43 -0500 Subject: [PATCH 23/75] KVM: guest_memfd: pass error up from filemap_grab_folio ANBZ: #34453 commit 70623723778a5156a03bc6e601be5df8c1fddb75 upstream. Some SNP ioctls will require the page not to be in the pagecache, and as such they will want to return EEXIST to userspace. Start by passing the error up from filemap_grab_folio. Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e88042651f1f..85c13ed1738b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -19,8 +19,8 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) /* TODO: Support huge pages. */ folio = filemap_grab_folio(inode->i_mapping, index); - if (IS_ERR_OR_NULL(folio)) - return NULL; + if (IS_ERR(folio)) + return folio; /* * Use the up-to-date flag to track whether or not the memory has been @@ -146,8 +146,8 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) } folio = kvm_gmem_get_folio(inode, index); - if (!folio) { - r = -ENOMEM; + if (IS_ERR(folio)) { + r = PTR_ERR(folio); break; } @@ -509,8 +509,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, } folio = kvm_gmem_get_folio(file_inode(file), index); - if (!folio) { - r = -ENOMEM; + if (IS_ERR(folio)) { + r = PTR_ERR(folio); goto out_fput; } -- Gitee From 2c3928d416942a4d15dca4e39a87a044eb19b7b8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Apr 2024 13:25:19 -0400 Subject: [PATCH 24/75] KVM: guest_memfd: limit overzealous WARN ANBZ: #34453 commit fa30b0dc91c815b9579d6f758437c35db059f5ae upstream. Because kvm_gmem_get_pfn() is called from the page fault path without any of the slots_lock, filemap lock or mmu_lock taken, it is possible for it to race with kvm_gmem_unbind(). This is not a problem, as any PTE that is installed temporarily will be zapped before the guest has the occasion to run. However, it is not possible to have a complete unbind+bind racing with the page fault, because deleting the memslot will call synchronize_srcu_expedited() and wait for the page fault to be resolved. Thus, we can still warn if the file is there and is not the one we expect. Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 85c13ed1738b..55fdc5889b26 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -503,7 +503,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gmem = file->private_data; - if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) { + if (xa_load(&gmem->bindings, index) != slot) { + WARN_ON_ONCE(xa_load(&gmem->bindings, index)); r = -EIO; goto out_fput; } -- Gitee From 7121b96c115290db1ca7dcb50f51dbcfe3dc3bf5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 May 2024 12:54:03 -0400 Subject: [PATCH 25/75] KVM: guest_memfd: Add hook for initializing memory ANBZ: #34453 commit 3bb2531e20bff99f9cd8719ee7aea8bd82687173 upstream. guest_memfd pages are generally expected to be in some arch-defined initial state prior to using them for guest memory. For SEV-SNP this initial state is 'private', or 'guest-owned', and requires additional operations to move these pages into a 'private' state by updating the corresponding entries the RMP table. Allow for an arch-defined hook to handle updates of this sort, and go ahead and implement one for x86 so KVM implementations like AMD SVM can register a kvm_x86_ops callback to handle these updates for SEV-SNP guests. The preparation callback is always called when allocating/grabbing folios via gmem, and it is up to the architecture to keep track of whether or not the pages are already in the expected state (e.g. the RMP table in the case of SEV-SNP). In some cases, it is necessary to defer the preparation of the pages to handle things like in-place encryption of initial guest memory payloads before marking these pages as 'private'/'guest-owned'. Add an argument (always true for now) to kvm_gmem_get_folio() that allows for the preparation callback to be bypassed. To detect possible issues in the way userspace initializes memory, it is only possible to add an unprepared page if it is not already included in the filemap. Link: https://lore.kernel.org/lkml/ZLqVdvsF11Ddo7Dq@google.com/ Co-developed-by: Michael Roth Signed-off-by: Michael Roth Message-Id: <20231230172351.574091-5-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 6 ++++ include/linux/kvm_host.h | 5 +++ virt/kvm/Kconfig | 4 +++ virt/kvm/guest_memfd.c | 51 ++++++++++++++++++++++++++++-- 6 files changed, 65 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 3eca5b380d4d..ecfdb076ffcb 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -136,6 +136,7 @@ KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) +KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) KVM_X86_OP_OPTIONAL(vm_attestation) KVM_X86_OP_OPTIONAL(arch_hypercall) KVM_X86_OP_OPTIONAL(control_pre_system_reset) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 41ddace7b521..dc6338322676 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1815,6 +1815,7 @@ struct kvm_x86_ops { gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); + int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); /* * Interfaces for HYGON CSV guest diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 07ccc0a9e056..1af95b8becf1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13596,6 +13596,12 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) +{ + return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order); +} +#endif int kvm_spec_ctrl_test_value(u64 value) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6f624641e46a..8cbb23f6cd20 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2482,4 +2482,9 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, } #endif /* CONFIG_KVM_PRIVATE_MEM */ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); +#endif + #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 30ee7340409f..3e13492e4e6b 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -109,3 +109,7 @@ config KVM_GENERIC_PRIVATE_MEM select KVM_GENERIC_MEMORY_ATTRIBUTES select KVM_PRIVATE_MEM bool + +config HAVE_KVM_GMEM_PREPARE + bool + depends on KVM_PRIVATE_MEM diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 55fdc5889b26..9ecf9aafd92d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -13,7 +13,43 @@ struct kvm_gmem { struct list_head entry; }; -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) +{ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE + struct list_head *gmem_list = &inode->i_mapping->i_private_list; + struct kvm_gmem *gmem; + + list_for_each_entry(gmem, gmem_list, entry) { + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + int rc; + + if (!kvm_arch_gmem_prepare_needed(kvm)) + continue; + + slot = xa_load(&gmem->bindings, index); + if (!slot) + continue; + + page = folio_file_page(folio, index); + pfn = page_to_pfn(page); + gfn = slot->base_gfn + index - slot->gmem.pgoff; + rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page))); + if (rc) { + pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx, error %d.\n", + index, rc); + return rc; + } + } + +#endif + return 0; +} + +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare) { struct folio *folio; @@ -41,6 +77,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) folio_mark_uptodate(folio); } + if (prepare) { + int r = kvm_gmem_prepare_folio(inode, index, folio); + if (r < 0) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(r); + } + } + /* * Ignore accessed, referenced, and dirty flags. The memory is * unevictable and there is no storage to write back to. @@ -145,7 +190,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) break; } - folio = kvm_gmem_get_folio(inode, index); + folio = kvm_gmem_get_folio(inode, index, true); if (IS_ERR(folio)) { r = PTR_ERR(folio); break; @@ -509,7 +554,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, goto out_fput; } - folio = kvm_gmem_get_folio(file_inode(file), index); + folio = kvm_gmem_get_folio(file_inode(file), index, true); if (IS_ERR(folio)) { r = PTR_ERR(folio); goto out_fput; -- Gitee From 4ee65cee7a68f6d15ef2d447278602ca0b141736 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Apr 2024 13:27:46 -0400 Subject: [PATCH 26/75] KVM: guest_memfd: extract __kvm_gmem_get_pfn() ANBZ: #34453 commit 17573fd971f9e31ddee420eca8359ceff87e9e51 upstream. In preparation for adding a function that walks a set of pages provided by userspace and populates them in a guest_memfd, add a version of kvm_gmem_get_pfn() that has a "bool prepare" argument and passes it down to kvm_gmem_get_folio(). Populating guest memory has to call repeatedly __kvm_gmem_get_pfn() on the same file, so make the new function take struct file*. Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 9ecf9aafd92d..634d9663e28e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -532,33 +532,29 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) fput(file); } -int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) { pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; - struct kvm_gmem *gmem; + struct kvm_gmem *gmem = file->private_data; struct folio *folio; struct page *page; - struct file *file; int r; - file = kvm_gmem_get_file(slot); - if (!file) + if (file != slot->gmem.file) { + WARN_ON_ONCE(slot->gmem.file); return -EFAULT; + } gmem = file->private_data; - if (xa_load(&gmem->bindings, index) != slot) { WARN_ON_ONCE(xa_load(&gmem->bindings, index)); - r = -EIO; - goto out_fput; + return -EIO; } - folio = kvm_gmem_get_folio(file_inode(file), index, true); - if (IS_ERR(folio)) { - r = PTR_ERR(folio); - goto out_fput; - } + folio = kvm_gmem_get_folio(file_inode(file), index, prepare); + if (IS_ERR(folio)) + return PTR_ERR(folio); if (folio_test_hwpoison(folio)) { r = -EHWPOISON; @@ -575,9 +571,21 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, out_unlock: folio_unlock(folio); -out_fput: - fput(file); return r; } + +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +{ + struct file *file = kvm_gmem_get_file(slot); + int r; + + if (!file) + return -EFAULT; + + r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true); + fput(file); + return r; +} EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); -- Gitee From 8e0fb2d04cc7ccb9e0cb18188e02256ead3fb38d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Feb 2024 12:09:06 -0500 Subject: [PATCH 27/75] KVM: guest_memfd: Add interface for populating gmem pages with user data ANBZ: #34453 commit 1f6c06b177513e8a47c43e95d1985dbd9cff3ddd upstream. During guest run-time, kvm_arch_gmem_prepare() is issued as needed to prepare newly-allocated gmem pages prior to mapping them into the guest. In the case of SEV-SNP, this mainly involves setting the pages to private in the RMP table. However, for the GPA ranges comprising the initial guest payload, which are encrypted/measured prior to starting the guest, the gmem pages need to be accessed prior to setting them to private in the RMP table so they can be initialized with the userspace-provided data. Additionally, an SNP firmware call is needed afterward to encrypt them in-place and measure the contents into the guest's launch digest. While it is possible to bypass the kvm_arch_gmem_prepare() hooks so that this handling can be done in an open-coded/vendor-specific manner, this may expose more gmem-internal state/dependencies to external callers than necessary. Try to avoid this by implementing an interface that tries to handle as much of the common functionality inside gmem as possible, while also making it generic enough to potentially be usable/extensible for TDX as well. Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Co-developed-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- include/linux/kvm_host.h | 27 +++++++++++++++++++++ virt/kvm/guest_memfd.c | 52 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8cbb23f6cd20..8178fb11ca72 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2487,4 +2487,31 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); #endif +/** + * kvm_gmem_populate() - Populate/prepare a GPA range with guest data + * + * @kvm: KVM instance + * @gfn: starting GFN to be populated + * @src: userspace-provided buffer containing data to copy into GFN range + * (passed to @post_populate, and incremented on each iteration + * if not NULL) + * @npages: number of pages to copy from userspace-buffer + * @post_populate: callback to issue for each gmem page that backs the GPA + * range + * @opaque: opaque data to pass to @post_populate callback + * + * This is primarily intended for cases where a gmem-backed GPA range needs + * to be initialized with userspace-provided data prior to being mapped into + * the guest as a private page. This should be called with the slots->lock + * held so that caller-enforced invariants regarding the expected memory + * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES. + * + * Returns the number of pages that were populated. + */ +typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *opaque); + +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, + kvm_gmem_populate_cb post_populate, void *opaque); + #endif diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 634d9663e28e..92c4ed92708d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -589,3 +589,55 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, return r; } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); + +long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, + kvm_gmem_populate_cb post_populate, void *opaque) +{ + struct file *file; + struct kvm_memory_slot *slot; + void __user *p; + + int ret = 0, max_order; + long i; + + lockdep_assert_held(&kvm->slots_lock); + if (npages < 0) + return -EINVAL; + + slot = gfn_to_memslot(kvm, start_gfn); + if (!kvm_slot_can_be_private(slot)) + return -EINVAL; + + file = kvm_gmem_get_file(slot); + if (!file) + return -EFAULT; + + filemap_invalidate_lock(file->f_mapping); + + npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); + for (i = 0; i < npages; i += (1 << max_order)) { + gfn_t gfn = start_gfn + i; + kvm_pfn_t pfn; + + ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); + if (ret) + break; + + if (!IS_ALIGNED(gfn, (1 << max_order)) || + (npages - i) < (1 << max_order)) + max_order = 0; + + p = src ? src + i * PAGE_SIZE : NULL; + ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + + put_page(pfn_to_page(pfn)); + if (ret) + break; + } + + filemap_invalidate_unlock(file->f_mapping); + + fput(file); + return ret && !i ? ret : i; +} +EXPORT_SYMBOL_GPL(kvm_gmem_populate); -- Gitee From 16716d2cdb6ef843a201e1d8acceabcec7d20329 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Sat, 30 Dec 2023 11:23:21 -0600 Subject: [PATCH 28/75] KVM: guest_memfd: Add hook for invalidating memory ANBZ: #34453 commit a90764f0e4ed5350cc9caeb384e0fb356c032587 upstream. In some cases, like with SEV-SNP, guest memory needs to be updated in a platform-specific manner before it can be safely freed back to the host. Wire up arch-defined hooks to the .free_folio kvm_gmem_aops callback to allow for special handling of this sort when freeing memory in response to FALLOC_FL_PUNCH_HOLE operations and when releasing the inode, and go ahead and define an arch-specific hook for x86 since it will be needed for handling memory used for SEV-SNP guests. Signed-off-by: Michael Roth Message-Id: <20231230172351.574091-6-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 7 +++++++ include/linux/kvm_host.h | 4 ++++ virt/kvm/Kconfig | 4 ++++ virt/kvm/guest_memfd.c | 14 ++++++++++++++ 6 files changed, 31 insertions(+) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index ecfdb076ffcb..08123587aa6a 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -137,6 +137,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) +KVM_X86_OP_OPTIONAL(gmem_invalidate) KVM_X86_OP_OPTIONAL(vm_attestation) KVM_X86_OP_OPTIONAL(arch_hypercall) KVM_X86_OP_OPTIONAL(control_pre_system_reset) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dc6338322676..14765da7d097 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1816,6 +1816,7 @@ struct kvm_x86_ops { gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); + void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); /* * Interfaces for HYGON CSV guest diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1af95b8becf1..ed769eef534f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13603,6 +13603,13 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord } #endif +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + static_call_cond(kvm_x86_gmem_invalidate)(start, end); +} +#endif + int kvm_spec_ctrl_test_value(u64 value) { /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8178fb11ca72..85858242bd0d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2514,4 +2514,8 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +#endif + #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 3e13492e4e6b..0521e0782c5a 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -113,3 +113,7 @@ config KVM_GENERIC_PRIVATE_MEM config HAVE_KVM_GMEM_PREPARE bool depends on KVM_PRIVATE_MEM + +config HAVE_KVM_GMEM_INVALIDATE + bool + depends on KVM_PRIVATE_MEM diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 92c4ed92708d..4c0cbb303e28 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -347,10 +347,24 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol return MF_DELAYED; } +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +static void kvm_gmem_free_folio(struct folio *folio) +{ + struct page *page = folio_page(folio, 0); + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); + + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); +} +#endif + static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE + .free_folio = kvm_gmem_free_folio, +#endif }; static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, -- Gitee From b1d6ca3eab02afeb75b3fe56db6644f18794d18a Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 03:51:52 -0500 Subject: [PATCH 29/75] KVM: x86: Add hook for determining max NPT mapping level ANBZ: #34453 commit f32fb32820b1139b29300733f339adbe0f10652d upstream. In the case of SEV-SNP, whether or not a 2MB page can be mapped via a 2MB mapping in the guest's nested page table depends on whether or not any subpages within the range have already been initialized as private in the RMP table. The existing mixed-attribute tracking in KVM is insufficient here, for instance: - gmem allocates 2MB page - guest issues PVALIDATE on 2MB page - guest later converts a subpage to shared - SNP host code issues PSMASH to split 2MB RMP mapping to 4K - KVM MMU splits NPT mapping to 4K - guest later converts that shared page back to private At this point there are no mixed attributes, and KVM would normally allow for 2MB NPT mappings again, but this is actually not allowed because the RMP table mappings are 4K and cannot be promoted on the hypervisor side, so the NPT mappings must still be limited to 4K to match this. Add a hook to determine the max NPT mapping size in situations like this. Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Reviewed-by: Isaku Yamahata Message-ID: <20240501085210.2213060-3-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 23 +++++++++++++++++++++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 08123587aa6a..b53440626b30 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -137,6 +137,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) +KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level) KVM_X86_OP_OPTIONAL(gmem_invalidate) KVM_X86_OP_OPTIONAL(vm_attestation) KVM_X86_OP_OPTIONAL(arch_hypercall) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 14765da7d097..4960e0c21b0c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1817,6 +1817,7 @@ struct kvm_x86_ops { void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); + int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); /* * Interfaces for HYGON CSV guest diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2f758f440448..0e00c6ed117f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4307,6 +4307,25 @@ static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, fault->is_private); } +static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + u8 max_level, int gmem_order) +{ + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + req_max_level = static_call(kvm_x86_private_max_mapping_level)(kvm, pfn); + if (req_max_level) + max_level = min(max_level, req_max_level); + + return req_max_level; +} + static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { @@ -4324,9 +4343,9 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, return r; } - fault->max_level = min(kvm_max_level_for_order(max_order), - fault->max_level); fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); + fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, + fault->max_level, max_order); return RET_PF_CONTINUE; } -- Gitee From 3c26ead5bd6f7be662e3cacf13cd4674804b2849 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 9 May 2024 18:05:29 -0500 Subject: [PATCH 30/75] KVM: MMU: Disable fast path if KVM_EXIT_MEMORY_FAULT is needed ANBZ: #34453 commit b74d002d3d587c38d01853580fd257db30edd1d0 upstream. For hardware-protected VMs like SEV-SNP guests, certain conditions like attempting to perform a write to a page which is not in the state that the guest expects it to be in can result in a nested/extended #PF which can only be satisfied by the host performing an implicit page state change to transition the page into the expected shared/private state. This is generally handled by generating a KVM_EXIT_MEMORY_FAULT event that gets forwarded to userspace to handle via KVM_SET_MEMORY_ATTRIBUTES. However, the fast_page_fault() code might misconstrue this situation as being the result of a write-protected access, and treat it as a spurious case when it sees that writes are already allowed for the sPTE. This results in the KVM MMU trying to resume the guest rather than taking any action to satisfy the real source of the #PF such as generating a KVM_EXIT_MEMORY_FAULT, resulting in the guest spinning on nested #PFs. Check for this condition and bail out of the fast path if it is detected. Suggested-by: Paolo Bonzini Suggested-by: Sean Christopherson Cc: Isaku Yamahata Reviewed-by: Isaku Yamahata Signed-off-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/mmu/mmu.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0e00c6ed117f..d60d35eb4038 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3345,7 +3345,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, return RET_PF_CONTINUE; } -static bool page_fault_can_be_fast(struct kvm_page_fault *fault) +static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only @@ -3356,6 +3356,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) if (fault->rsvd) return false; + /* + * For hardware-protected VMs, certain conditions like attempting to + * perform a write to a page which is not in the state that the guest + * expects it to be in can result in a nested/extended #PF. In this + * case, the below code might misconstrue this situation as being the + * result of a write-protected access, and treat it as a spurious case + * rather than taking any action to satisfy the real source of the #PF + * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the + * guest spinning on a #PF indefinitely, so don't attempt the fast path + * in this case. + * + * Note that the kvm_mem_is_private() check might race with an + * attribute update, but this will either result in the guest spinning + * on RET_PF_SPURIOUS until the update completes, or an actual spurious + * case might go down the slow path. Either case will resolve itself. + */ + if (kvm->arch.has_private_mem && + fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) + return false; + /* * #PF can be fast if: * @@ -3456,7 +3476,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 *sptep = NULL; uint retry_count = 0; - if (!page_fault_can_be_fast(fault)) + if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); -- Gitee From 8a59c9a4448d787fda310bf8a19483d3addf13d0 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 03:51:53 -0500 Subject: [PATCH 31/75] KVM: SEV: Select KVM_GENERIC_PRIVATE_MEM when CONFIG_KVM_AMD_SEV=y ANBZ: #34453 commit a8e31983335554193c2cb373161d08880230abfd upstream. SEV-SNP relies on private memory support to run guests, so make sure to enable that support via the CONFIG_KVM_GENERIC_PRIVATE_MEM config option. Signed-off-by: Michael Roth Message-ID: <20240501085210.2213060-4-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a5a5140989de..8c5d9abc0934 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -128,6 +128,7 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM + select KVM_GENERIC_PRIVATE_MEM help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. -- Gitee From 6e851dbce99fbcdb64502aa7899d8824599b3a91 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 03:51:54 -0500 Subject: [PATCH 32/75] KVM: SEV: Add initial SEV-SNP support ANBZ: #34453 commit 1dfe571c12cf99244b933208fb77f29471ded677 upstream. SEV-SNP builds upon existing SEV and SEV-ES functionality while adding new hardware-based security protection. SEV-SNP adds strong memory encryption and integrity protection to help prevent malicious hypervisor-based attacks such as data replay, memory re-mapping, and more, to create an isolated execution environment. Define a new KVM_X86_SNP_VM type which makes use of these capabilities and extend the KVM_SEV_INIT2 ioctl to support it. Also add a basic helper to check whether SNP is enabled and set PFERR_PRIVATE_ACCESS for private #NPFs so they are handled appropriately by KVM MMU. Signed-off-by: Brijesh Singh Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Ashish Kalra Reviewed-by: Paolo Bonzini Message-ID: <20240501085210.2213060-5-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/svm.h | 3 ++- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/svm/sev.c | 21 ++++++++++++++++++++- arch/x86/kvm/svm/svm.c | 8 +++++++- arch/x86/kvm/svm/svm.h | 12 ++++++++++++ 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 87a7b917d30e..9eeb26ff60ba 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -285,7 +285,8 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) -#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) struct vmcb_seg { u16 selector; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 74248345fd9d..7efbc1a926e7 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -867,5 +867,6 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SW_PROTECTED_VM 1 #define KVM_X86_SEV_VM 2 #define KVM_X86_SEV_ES_VM 3 +#define KVM_X86_SNP_VM 4 #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 813de9a74fe5..43be0cec94ef 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -50,6 +50,9 @@ module_param_named(sev, sev_enabled, bool, 0444); static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); +/* enable/disable SEV-SNP support */ +static bool sev_snp_enabled; + /* enable/disable SEV-ES DebugSwap support */ static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); @@ -358,6 +361,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (sev->es_active && !sev->ghcb_version) sev->ghcb_version = GHCB_VERSION_DEFAULT; + if (vm_type == KVM_X86_SNP_VM) + sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; + #ifdef CONFIG_KVM_SUPPORTS_CSV_REUSE_ASID /* Try reuse ASID iff userid array is available for HYGON CSV guests */ @@ -458,7 +464,8 @@ static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; if (kvm->arch.vm_type != KVM_X86_SEV_VM && - kvm->arch.vm_type != KVM_X86_SEV_ES_VM) + kvm->arch.vm_type != KVM_X86_SEV_ES_VM && + kvm->arch.vm_type != KVM_X86_SNP_VM) return -EINVAL; if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) @@ -2466,6 +2473,10 @@ void __init sev_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SEV_ES); kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); } + if (sev_snp_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); + } } #ifdef CONFIG_HYGON_CSV @@ -2487,6 +2498,7 @@ static void sev_install_hooks(void) void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -2581,6 +2593,7 @@ void __init sev_hardware_setup(void) } WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; + sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: if (boot_cpu_has(X86_FEATURE_SEV)) @@ -2598,9 +2611,15 @@ void __init sev_hardware_setup(void) 1 : (min_sev_asid > 1 ? 1 : 0), (is_x86_vendor_hygon() && hygon_csv_build >= 1810) ? max_sev_asid : min_sev_asid - 1); + if (boot_cpu_has(X86_FEATURE_SEV_SNP)) + pr_info("SEV-SNP %s (ASIDs %u - %u)\n", + sev_snp_supported ? "enabled" : "disabled", + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; + sev_snp_enabled = sev_snp_supported; + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) sev_es_debug_swap_enabled = false; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index da317d46049b..416c68e96e22 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2109,6 +2109,9 @@ static int npf_interception(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) error_code &= ~PFERR_SYNTHETIC_MASK; + if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) + error_code |= PFERR_PRIVATE_ACCESS; + trace_kvm_page_fault(vcpu, fault_address, error_code); return kvm_mmu_page_fault(vcpu, fault_address, error_code, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? @@ -5076,8 +5079,11 @@ static int svm_vm_init(struct kvm *kvm) if (type != KVM_X86_DEFAULT_VM && type != KVM_X86_SW_PROTECTED_VM) { - kvm->arch.has_protected_state = (type == KVM_X86_SEV_ES_VM); + kvm->arch.has_protected_state = + (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); to_kvm_sev_info(kvm)->need_init = true; + + kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); } if (!pause_filter_count || !pause_filter_thresh) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 56f7df543586..2a70cf5ca609 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -358,6 +358,18 @@ static __always_inline bool sev_es_guest(struct kvm *kvm) #endif } +static __always_inline bool sev_snp_guest(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_AMD_SEV + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && + !WARN_ON_ONCE(!sev_es_guest(kvm)); +#else + return false; +#endif +} + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; -- Gitee From 5db0daf6a4149105b2e2e526ea1facab9312b194 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 03:51:55 -0500 Subject: [PATCH 33/75] KVM: SEV: Add KVM_SEV_SNP_LAUNCH_START command ANBZ: #34453 commit 136d8bc931c84fbe4c70c2d6e0a4d20a2aa90505 upstream. KVM_SEV_SNP_LAUNCH_START begins the launch process for an SEV-SNP guest. The command initializes a cryptographic digest context used to construct the measurement of the guest. Other commands can then at that point be used to load/encrypt data into the guest's initial launch image. For more information see the SEV-SNP specification. Signed-off-by: Brijesh Singh Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-6-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../virt/kvm/x86/amd-memory-encryption.rst | 28 ++- arch/x86/include/uapi/asm/kvm.h | 11 ++ arch/x86/kvm/svm/sev.c | 176 +++++++++++++++++- arch/x86/kvm/svm/svm.h | 1 + 4 files changed, 212 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 9677a0714a39..dd179e162a87 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -466,6 +466,30 @@ issued by the hypervisor to make the guest ready for execution. Returns: 0 on success, -negative on error +18. KVM_SEV_SNP_LAUNCH_START +---------------------------- + +The KVM_SNP_LAUNCH_START command is used for creating the memory encryption +context for the SEV-SNP guest. It must be called prior to issuing +KVM_SEV_SNP_LAUNCH_UPDATE or KVM_SEV_SNP_LAUNCH_FINISH; + +Parameters (in): struct kvm_sev_snp_launch_start + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_snp_launch_start { + __u64 policy; /* Guest policy to use. */ + __u8 gosvw[16]; /* Guest OS visible workarounds. */ + __u16 flags; /* Must be zero. */ + __u8 pad0[6]; + __u64 pad1[4]; + }; + +See SNP_LAUNCH_START in the SEV-SNP specification [snp-fw-abi]_ for further +details on the input parameters in ``struct kvm_sev_snp_launch_start``. + Device attribute API ==================== @@ -497,9 +521,11 @@ References ========== -See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info. +See [white-paper]_, [api-spec]_, [amd-apm]_, [kvm-forum]_, and [snp-fw-abi]_ +for more info. .. [white-paper] https://developer.amd.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf .. [api-spec] https://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf .. [amd-apm] https://support.amd.com/TechDocs/24593.pdf (section 15.34) .. [kvm-forum] https://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf +.. [snp-fw-abi] https://www.amd.com/system/files/TechDocs/56860.pdf diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7efbc1a926e7..1cc791f10c74 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -690,6 +690,9 @@ enum sev_cmd_id { /* Second time is the charm; improved versions of the above ioctls. */ KVM_SEV_INIT2, + /* SNP-specific commands */ + KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_NR_MAX, }; @@ -817,6 +820,14 @@ struct kvm_sev_receive_update_data { __u32 pad2; }; +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; + __u16 flags; + __u8 pad0[6]; + __u64 pad1[4]; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 43be0cec94ef..3df214eb4939 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "mmu.h" #include "x86.h" @@ -62,6 +63,21 @@ static u64 sev_supported_vmsa_features; #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + +#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) + static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); @@ -72,6 +88,8 @@ static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; +static int snp_decommission_context(struct kvm *kvm); + struct enc_region { struct list_head list; unsigned long npages; @@ -98,12 +116,17 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) down_write(&sev_deactivate_lock); wbinvd_on_all_cpus(); - ret = sev_guest_df_flush(&error); + + if (sev_snp_enabled) + ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); + else + ret = sev_guest_df_flush(&error); up_write(&sev_deactivate_lock); if (ret) - pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", + sev_snp_enabled ? "-SNP" : "", ret, error); return ret; } @@ -2135,6 +2158,106 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val) } } +/* + * The guest context contains all the information, keys and metadata + * associated with the guest that the firmware tracks to implement SEV + * and SNP features. The firmware stores the guest context in hypervisor + * provide page via the SNP_GCTX_CREATE command. + */ +static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct sev_data_snp_addr data = {}; + void *context; + int rc; + + /* Allocate memory for context page */ + context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); + if (!context) + return NULL; + + data.address = __psp_pa(context); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); + if (rc) { + pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", + rc, argp->error); + snp_free_firmware_page(context); + return NULL; + } + + return context; +} + +static int snp_bind_asid(struct kvm *kvm, int *error) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_activate data = {0}; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.asid = sev_get_asid(kvm); + return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); +} + +static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_launch_start start = {0}; + struct kvm_sev_snp_launch_start params; + int rc; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + /* Don't allow userspace to allocate memory for more than 1 SNP context. */ + if (sev->snp_context) + return -EINVAL; + + sev->snp_context = snp_context_create(kvm, argp); + if (!sev->snp_context) + return -ENOTTY; + + if (params.flags) + return -EINVAL; + + if (params.policy & ~SNP_POLICY_MASK_VALID) + return -EINVAL; + + /* Check for policy bits that must be set */ + if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) || + !(params.policy & SNP_POLICY_MASK_SMT)) + return -EINVAL; + + if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) + return -EINVAL; + + start.gctx_paddr = __psp_pa(sev->snp_context); + start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); + if (rc) { + pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", + __func__, rc); + goto e_free_context; + } + + sev->fd = argp->sev_fd; + rc = snp_bind_asid(kvm, &argp->error); + if (rc) { + pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", + __func__, rc); + goto e_free_context; + } + + return 0; + +e_free_context: + snp_decommission_context(kvm); + + return rc; +} + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2158,6 +2281,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) goto out; } + /* + * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only + * allow the use of SNP-specific commands. + */ + if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) { + r = -EPERM; + goto out; + } + switch (sev_cmd.id) { case KVM_SEV_ES_INIT: if (!sev_es_enabled) { @@ -2222,6 +2354,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_RECEIVE_FINISH: r = sev_receive_finish(kvm, &sev_cmd); break; + case KVM_SEV_SNP_LAUNCH_START: + r = snp_launch_start(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -2418,6 +2553,31 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) return ret; } +static int snp_decommission_context(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_addr data = {}; + int ret; + + /* If context is not created then do nothing */ + if (!sev->snp_context) + return 0; + + /* Do the decommision, which will unbind the ASID from the SNP context */ + data.address = __sme_pa(sev->snp_context); + down_write(&sev_deactivate_lock); + ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); + up_write(&sev_deactivate_lock); + + if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) + return ret; + + snp_free_firmware_page(sev->snp_context); + sev->snp_context = NULL; + + return 0; +} + void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -2459,7 +2619,17 @@ void sev_vm_destroy(struct kvm *kvm) } } - sev_unbind_asid(kvm, sev->handle); + if (sev_snp_guest(kvm)) { + /* + * Decomission handles unbinding of the ASID. If it fails for + * some unexpected reason, just leak the ASID. + */ + if (snp_decommission_context(kvm)) + return; + } else { + sev_unbind_asid(kvm, sev->handle); + } + sev_asid_free(sev); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2a70cf5ca609..18f9d70340e2 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -96,6 +96,7 @@ struct kvm_sev_info { struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; + void *snp_context; /* SNP guest context page */ }; struct kvm_svm { -- Gitee From 2385ded96ec01b78589eca3eda1382b21a885672 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 03:51:56 -0500 Subject: [PATCH 34/75] KVM: SEV: Add KVM_SEV_SNP_LAUNCH_UPDATE command ANBZ: #34453 commit dee5a47cc7a45287ec1137edb745bb4dffbe85f6 upstream. A key aspect of a launching an SNP guest is initializing it with a known/measured payload which is then encrypted into guest memory as pre-validated private pages and then measured into the cryptographic launch context created with KVM_SEV_SNP_LAUNCH_START so that the guest can attest itself after booting. Since all private pages are provided by guest_memfd, make use of the kvm_gmem_populate() interface to handle this. The general flow is that guest_memfd will handle allocating the pages associated with the GPA ranges being initialized by each particular call of KVM_SEV_SNP_LAUNCH_UPDATE, copying data from userspace into those pages, and then the post_populate callback will do the work of setting the RMP entries for these pages to private and issuing the SNP firmware calls to encrypt/measure them. For more information see the SEV-SNP specification. Signed-off-by: Brijesh Singh Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-7-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../virt/kvm/x86/amd-memory-encryption.rst | 54 ++++ arch/x86/include/uapi/asm/kvm.h | 19 ++ arch/x86/kvm/svm/sev.c | 230 ++++++++++++++++++ 3 files changed, 303 insertions(+) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index dd179e162a87..cc16a7426d18 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -490,6 +490,60 @@ Returns: 0 on success, -negative on error See SNP_LAUNCH_START in the SEV-SNP specification [snp-fw-abi]_ for further details on the input parameters in ``struct kvm_sev_snp_launch_start``. +19. KVM_SEV_SNP_LAUNCH_UPDATE +----------------------------- + +The KVM_SEV_SNP_LAUNCH_UPDATE command is used for loading userspace-provided +data into a guest GPA range, measuring the contents into the SNP guest context +created by KVM_SEV_SNP_LAUNCH_START, and then encrypting/validating that GPA +range so that it will be immediately readable using the encryption key +associated with the guest context once it is booted, after which point it can +attest the measurement associated with its context before unlocking any +secrets. + +It is required that the GPA ranges initialized by this command have had the +KVM_MEMORY_ATTRIBUTE_PRIVATE attribute set in advance. See the documentation +for KVM_SET_MEMORY_ATTRIBUTES for more details on this aspect. + +Upon success, this command is not guaranteed to have processed the entire +range requested. Instead, the ``gfn_start``, ``uaddr``, and ``len`` fields of +``struct kvm_sev_snp_launch_update`` will be updated to correspond to the +remaining range that has yet to be processed. The caller should continue +calling this command until those fields indicate the entire range has been +processed, e.g. ``len`` is 0, ``gfn_start`` is equal to the last GFN in the +range plus 1, and ``uaddr`` is the last byte of the userspace-provided source +buffer address plus 1. In the case where ``type`` is KVM_SEV_SNP_PAGE_TYPE_ZERO, +``uaddr`` will be ignored completely. + +Parameters (in): struct kvm_sev_snp_launch_update + +Returns: 0 on success, < 0 on error, -EAGAIN if caller should retry + +:: + + struct kvm_sev_snp_launch_update { + __u64 gfn_start; /* Guest page number to load/encrypt data into. */ + __u64 uaddr; /* Userspace address of data to be loaded/encrypted. */ + __u64 len; /* 4k-aligned length in bytes to copy into guest memory.*/ + __u8 type; /* The type of the guest pages being initialized. */ + __u8 pad0; + __u16 flags; /* Must be zero. */ + __u32 pad1; + __u64 pad2[4]; + + }; + +where the allowed values for page_type are #define'd as:: + + KVM_SEV_SNP_PAGE_TYPE_NORMAL + KVM_SEV_SNP_PAGE_TYPE_ZERO + KVM_SEV_SNP_PAGE_TYPE_UNMEASURED + KVM_SEV_SNP_PAGE_TYPE_SECRETS + KVM_SEV_SNP_PAGE_TYPE_CPUID + +See the SEV-SNP spec [snp-fw-abi]_ for further details on how each page type is +used/measured. + Device attribute API ==================== diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 1cc791f10c74..90fd1739e3ae 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -692,6 +692,7 @@ enum sev_cmd_id { /* SNP-specific commands */ KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_SNP_LAUNCH_UPDATE, KVM_SEV_NR_MAX, }; @@ -828,6 +829,24 @@ struct kvm_sev_snp_launch_start { __u64 pad1[4]; }; +/* Kept in sync with firmware values for simplicity. */ +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 gfn_start; + __u64 uaddr; + __u64 len; + __u8 type; + __u8 pad0; + __u16 flags; + __u32 pad1; + __u64 pad2[4]; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 3df214eb4939..0645671589d0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -328,6 +328,45 @@ static void sev_decommission(unsigned int handle) sev_guest_decommission(&decommission, NULL); } +/* + * Certain page-states, such as Pre-Guest and Firmware pages (as documented + * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be + * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE + * unless they are reclaimed first. + * + * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they + * might not be usable by the host due to being set as immutable or still + * being associated with a guest ASID. + */ +static int snp_page_reclaim(u64 pfn) +{ + struct sev_data_snp_page_reclaim data = {0}; + int err, rc; + + data.paddr = __sme_set(pfn << PAGE_SHIFT); + rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); + if (WARN_ONCE(rc, "Failed to reclaim PFN %llx", pfn)) + snp_leak_pages(pfn, 1); + + return rc; +} + +/* + * Transition a page to hypervisor-owned/shared state in the RMP table. This + * should not fail under normal conditions, but leak the page should that + * happen since it will no longer be usable by the host due to RMP protections. + */ +static int host_rmp_make_shared(u64 pfn, enum pg_level level) +{ + int rc; + + rc = rmp_make_shared(pfn, level); + if (WARN_ON_ONCE(rc)) + snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); + + return rc; +} + static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) { struct sev_data_deactivate deactivate; @@ -2258,6 +2297,194 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return rc; } +struct sev_gmem_populate_args { + __u8 type; + int sev_fd; + int fw_error; +}; + +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, + void __user *src, int order, void *opaque) +{ + struct sev_gmem_populate_args *sev_populate_args = opaque; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + int n_private = 0, ret, i; + int npages = (1 << order); + gfn_t gfn; + + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) + return -EINVAL; + + for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { + struct sev_data_snp_launch_update fw_args = {0}; + bool assigned; + int level; + + if (!kvm_mem_is_private(kvm, gfn)) { + pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n", + __func__, gfn); + ret = -EINVAL; + goto err; + } + + ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = -EINVAL; + goto err; + } + + if (src) { + void *vaddr = kmap_local_pfn(pfn + i); + + ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE); + if (ret) + goto err; + kunmap_local(vaddr); + } + + ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto err; + + n_private++; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; + + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); + if (ret) + goto fw_err; + } + + return 0; + +fw_err: + /* + * If the firmware command failed handle the reclaim and cleanup of that + * PFN specially vs. prior pages which can be cleaned up below without + * needing to reclaim in advance. + * + * Additionally, when invalid CPUID function entries are detected, + * firmware writes the expected values into the page and leaves it + * unencrypted so it can be used for debugging and error-reporting. + * + * Copy this page back into the source buffer so userspace can use this + * information to provide information on which CPUID leaves/fields + * failed CPUID validation. + */ + if (!snp_page_reclaim(pfn + i) && !host_rmp_make_shared(pfn + i, PG_LEVEL_4K) && + sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && + sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + pr_debug("Failed to write CPUID page back to userspace\n"); + + kunmap_local(vaddr); + } + + /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ + n_private--; + +err: + pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", + __func__, ret, sev_populate_args->fw_error, n_private); + for (i = 0; i < n_private; i++) + host_rmp_make_shared(pfn + i, PG_LEVEL_4K); + + return ret; +} + +static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_gmem_populate_args sev_populate_args = {0}; + struct kvm_sev_snp_launch_update params; + struct kvm_memory_slot *memslot; + long npages, count; + void __user *src; + int ret = 0; + + if (!sev_snp_guest(kvm) || !sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, + params.gfn_start, params.len, params.type, params.flags); + + if (!PAGE_ALIGNED(params.len) || params.flags || + (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && + params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && + params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && + params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && + params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) + return -EINVAL; + + npages = params.len / PAGE_SIZE; + + /* + * For each GFN that's being prepared as part of the initial guest + * state, the following pre-conditions are verified: + * + * 1) The backing memslot is a valid private memslot. + * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES + * beforehand. + * 3) The PFN of the guest_memfd has not already been set to private + * in the RMP table. + * + * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page + * faults if there's a race between a fault and an attribute update via + * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized + * here. However, kvm->slots_lock guards against both this as well as + * concurrent memslot updates occurring while these checks are being + * performed, so use that here to make it easier to reason about the + * initial expected state and better guard against unexpected + * situations. + */ + mutex_lock(&kvm->slots_lock); + + memslot = gfn_to_memslot(kvm, params.gfn_start); + if (!kvm_slot_can_be_private(memslot)) { + ret = -EINVAL; + goto out; + } + + sev_populate_args.sev_fd = argp->sev_fd; + sev_populate_args.type = params.type; + src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr); + + count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, + sev_gmem_post_populate, &sev_populate_args); + if (count < 0) { + argp->error = sev_populate_args.fw_error; + pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", + __func__, count, argp->error); + ret = -EIO; + } else { + params.gfn_start += count; + params.len -= count * PAGE_SIZE; + if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) + params.uaddr += count * PAGE_SIZE; + + ret = 0; + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) + ret = -EFAULT; + } + +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2357,6 +2584,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_SNP_LAUNCH_START: r = snp_launch_start(kvm, &sev_cmd); break; + case KVM_SEV_SNP_LAUNCH_UPDATE: + r = snp_launch_update(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; -- Gitee From 765f765435b1b94c0d3670c653f84191960b6783 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 03:51:57 -0500 Subject: [PATCH 35/75] KVM: SEV: Add KVM_SEV_SNP_LAUNCH_FINISH command ANBZ: #34453 commit ad27ce155566f2b4400fa865859834592bd18777 upstream. Add a KVM_SEV_SNP_LAUNCH_FINISH command to finalize the cryptographic launch digest which stores the measurement of the guest at launch time. Also extend the existing SNP firmware data structures to support disabling the use of Versioned Chip Endorsement Keys (VCEK) by guests as part of this command. While finalizing the launch flow, the code also issues the LAUNCH_UPDATE SNP firmware commands to encrypt/measure the initial VMSA pages for each configured vCPU, which requires setting the RMP entries for those pages to private, so also add handling to clean up the RMP entries for these pages whening freeing vCPUs during shutdown. Signed-off-by: Brijesh Singh Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Harald Hoyer Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-8-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- .../virt/kvm/x86/amd-memory-encryption.rst | 28 ++++ arch/x86/include/uapi/asm/kvm.h | 17 +++ arch/x86/kvm/svm/sev.c | 127 ++++++++++++++++++ include/linux/psp-sev.h | 4 +- 4 files changed, 175 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index cc16a7426d18..1ddb6a86ce7f 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -544,6 +544,34 @@ where the allowed values for page_type are #define'd as:: See the SEV-SNP spec [snp-fw-abi]_ for further details on how each page type is used/measured. +20. KVM_SEV_SNP_LAUNCH_FINISH +----------------------------- + +After completion of the SNP guest launch flow, the KVM_SEV_SNP_LAUNCH_FINISH +command can be issued to make the guest ready for execution. + +Parameters (in): struct kvm_sev_snp_launch_finish + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[32]; + __u8 pad0[3]; + __u16 flags; /* Must be zero */ + __u64 pad1[4]; + }; + + +See SNP_LAUNCH_FINISH in the SEV-SNP specification [snp-fw-abi]_ for further +details on the input parameters in ``struct kvm_sev_snp_launch_finish``. + Device attribute API ==================== diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 90fd1739e3ae..ddc9691054e5 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -693,6 +693,7 @@ enum sev_cmd_id { /* SNP-specific commands */ KVM_SEV_SNP_LAUNCH_START = 100, KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, KVM_SEV_NR_MAX, }; @@ -847,6 +848,22 @@ struct kvm_sev_snp_launch_update { __u64 pad2[4]; }; +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad0[3]; + __u16 flags; + __u64 pad1[4]; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0645671589d0..ed282b13de95 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -78,6 +78,8 @@ static u64 sev_supported_vmsa_features; SNP_POLICY_MASK_DEBUG | \ SNP_POLICY_MASK_SINGLE_SOCKET) +#define INITIAL_VMSA_GPA 0xFFFFFFFFF000 + static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); @@ -2485,6 +2487,115 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_launch_update data = {}; + struct kvm_vcpu *vcpu; + unsigned long i; + int ret; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.page_type = SNP_PAGE_TYPE_VMSA; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* Transition the VMSA page to a firmware state. */ + ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); + if (ret) + return ret; + + /* Issue the SNP command to encrypt the VMSA */ + data.address = __sme_pa(svm->sev_es.vmsa); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &data, &argp->error); + if (ret) { + if (!snp_page_reclaim(pfn)) + host_rmp_make_shared(pfn, PG_LEVEL_4K); + + return ret; + } + + svm->vcpu.arch.guest_state_protected = true; + } + + return 0; +} + +static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_snp_launch_finish params; + struct sev_data_snp_launch_finish *data; + void *id_block = NULL, *id_auth = NULL; + int ret; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (!sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + if (params.flags) + return -EINVAL; + + /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */ + ret = snp_launch_update_vmsa(kvm, argp); + if (ret) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + if (params.id_block_en) { + id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); + if (IS_ERR(id_block)) { + ret = PTR_ERR(id_block); + goto e_free; + } + + data->id_block_en = 1; + data->id_block_paddr = __sme_pa(id_block); + + id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); + if (IS_ERR(id_auth)) { + ret = PTR_ERR(id_auth); + goto e_free_id_block; + } + + data->id_auth_paddr = __sme_pa(id_auth); + + if (params.auth_key_en) + data->auth_key_en = 1; + } + + data->vcek_disabled = params.vcek_disabled; + + memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); + data->gctx_paddr = __psp_pa(sev->snp_context); + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + + kfree(id_auth); + +e_free_id_block: + kfree(id_block); + +e_free: + kfree(data); + + return ret; +} + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2587,6 +2698,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_SNP_LAUNCH_UPDATE: r = snp_launch_update(kvm, &sev_cmd); break; + case KVM_SEV_SNP_LAUNCH_FINISH: + r = snp_launch_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -3131,6 +3245,18 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); + /* + * If it's an SNP guest, then the VMSA was marked in the RMP table as + * a guest-owned page. Transition the page to hypervisor state before + * releasing it back to the system. + */ + if (sev_snp_guest(vcpu->kvm)) { + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + if (host_rmp_make_shared(pfn, PG_LEVEL_4K)) + goto skip_vmsa_free; + } + if (vcpu->arch.guest_state_protected) sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); @@ -3139,6 +3265,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) if (svm->sev_es.ghcb) kvm_vcpu_unmap(vcpu, &svm->sev_es.ghcb_map, false); +skip_vmsa_free: if (svm->sev_es.ghcb_sa_free) kvfree(svm->sev_es.ghcb_sa); diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 2ced37236ed3..69442badf343 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -659,6 +659,7 @@ struct sev_data_snp_launch_update { * @id_auth_paddr: system physical address of ID block authentication structure * @id_block_en: indicates whether ID block is present * @auth_key_en: indicates whether author key is present in authentication structure + * @vcek_disabled: indicates whether use of VCEK is allowed for attestation reports * @rsvd: reserved * @host_data: host-supplied data for guest, not interpreted by firmware */ @@ -668,7 +669,8 @@ struct sev_data_snp_launch_finish { u64 id_auth_paddr; u8 id_block_en:1; u8 auth_key_en:1; - u64 rsvd:62; + u8 vcek_disabled:1; + u64 rsvd:61; u8 host_data[32]; } __packed; -- Gitee From a3677039f468d2da844bbd220d0c583836cc1a0a Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 03:51:58 -0500 Subject: [PATCH 36/75] KVM: SEV: Add support to handle GHCB GPA register VMGEXIT ANBZ: #34453 commit 0c76b1d08280649f789e1b537b397cefc43da7a0 upstream. SEV-SNP guests are required to perform a GHCB GPA registration. Before using a GHCB GPA for a vCPU the first time, a guest must register the vCPU GHCB GPA. If hypervisor can work with the guest requested GPA then it must respond back with the same GPA otherwise return -1. On VMEXIT, verify that the GHCB GPA matches with the registered value. If a mismatch is detected, then abort the guest. Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth Message-ID: <20240501085210.2213060-9-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/sev-common.h | 8 ++++++ arch/x86/kvm/svm/sev.c | 48 +++++++++++++++++++++++++++---- arch/x86/kvm/svm/svm.h | 7 +++++ 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 5a8246dd532f..1006bfffe07a 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -59,6 +59,14 @@ #define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 #define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) +/* Preferred GHCB GPA Request */ +#define GHCB_MSR_PREF_GPA_REQ 0x010 +#define GHCB_MSR_GPA_VALUE_POS 12 +#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0) + +#define GHCB_MSR_PREF_GPA_RESP 0x011 +#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff + /* GHCB GPA Register */ #define GHCB_MSR_REG_GPA_REQ 0x012 #define GHCB_MSR_REG_GPA_REQ_VAL(v) \ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ed282b13de95..ce59c96ac9cd 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3744,6 +3744,32 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); break; + case GHCB_MSR_PREF_GPA_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_REG_GPA_REQ: { + u64 gfn; + + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + + svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); + + set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -3756,12 +3782,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; - vcpu->run->system_event.ndata = 1; - vcpu->run->system_event.data[0] = control->ghcb_gpa; - - return 0; + goto out_terminate; } default: /* Error, keep GHCB MSR value as-is */ @@ -3772,6 +3793,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) control->ghcb_gpa, ret); return ret; + +out_terminate: + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } int sev_handle_vmgexit(struct kvm_vcpu *vcpu) @@ -3807,6 +3836,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); sev_es_sync_from_ghcb(svm); + + /* SEV-SNP guest requires that the GHCB GPA must be registered */ + if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); + return -EINVAL; + } + ret = sev_es_validate_vmgexit(svm); if (ret) return ret; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 18f9d70340e2..13020fda2ef1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -211,12 +211,14 @@ struct vcpu_sev_es_state { bool ghcb_sa_sync; bool ghcb_sa_free; + #ifdef CONFIG_HYGON_CSV /* migrated ghcb mapping state for HYGON CSV2 */ bool receiver_ghcb_map_fail; /* CSV2 reboot vmsa */ struct vmcb_save_area *reset_vmsa; #endif + u64 ghcb_registered_gpa; }; struct vcpu_svm { @@ -371,6 +373,11 @@ static __always_inline bool sev_snp_guest(struct kvm *kvm) #endif } +static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) +{ + return svm->sev_es.ghcb_registered_gpa == val; +} + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; -- Gitee From fbe86924157778cc37aa1ec47358b63c328d78ec Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 03:51:59 -0500 Subject: [PATCH 37/75] KVM: SEV: Add support to handle MSR based Page State Change VMGEXIT ANBZ: #34453 commit d46b7b6a5f9ec652a9e3ac6344f679235c9b67da upstream. SEV-SNP VMs can ask the hypervisor to change the page state in the RMP table to be private or shared using the Page State Change MSR protocol as defined in the GHCB specification. When using gmem, private/shared memory is allocated through separate pools, and KVM relies on userspace issuing a KVM_SET_MEMORY_ATTRIBUTES KVM ioctl to tell the KVM MMU whether or not a particular GFN should be backed by private memory or not. Forward these page state change requests to userspace so that it can issue the expected KVM ioctls. The KVM MMU will handle updating the RMP entries when it is ready to map a private page into a guest. Use the existing KVM_HC_MAP_GPA_RANGE hypercall format to deliver these requests to userspace via KVM_EXIT_HYPERCALL. Signed-off-by: Michael Roth Co-developed-by: Brijesh Singh Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-10-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/sev-common.h | 6 ++++ arch/x86/kvm/svm/sev.c | 48 +++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 1006bfffe07a..6d68db812de1 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -101,11 +101,17 @@ enum psc_op { /* GHCBData[11:0] */ \ GHCB_MSR_PSC_REQ) +#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12) +#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52) + #define GHCB_MSR_PSC_RESP 0x015 #define GHCB_MSR_PSC_RESP_VAL(val) \ /* GHCBData[63:32] */ \ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) +/* Set highest bit as a generic error response */ +#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP) + /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ce59c96ac9cd..43f7b939f0bb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3665,6 +3665,48 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (vcpu->run->hypercall.ret) + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + else + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); + + return 1; /* resume guest */ +} + +static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) +{ + u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); + u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = 1; + vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + + vcpu->arch.complete_userspace_io = snp_complete_psc_msr; + + return 0; /* forward request to userspace */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3770,6 +3812,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_PSC_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + ret = snp_begin_psc_msr(svm, control->ghcb_gpa); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; -- Gitee From 8a528c59d767175f32a4c108193f6fb64a258f99 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 03:52:00 -0500 Subject: [PATCH 38/75] KVM: SEV: Add support to handle Page State Change VMGEXIT ANBZ: #34453 commit 9b54e248d2644be71cb394eb85f31ad99e023a05 upstream. SEV-SNP VMs can ask the hypervisor to change the page state in the RMP table to be private or shared using the Page State Change NAE event as defined in the GHCB specification version 2. Forward these requests to userspace as KVM_EXIT_VMGEXITs, similar to how it is done for requests that don't use a GHCB page. As with the MSR-based page-state changes, use the existing KVM_HC_MAP_GPA_RANGE hypercall format to deliver these requests to userspace via KVM_EXIT_HYPERCALL. Signed-off-by: Michael Roth Co-developed-by: Brijesh Singh Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-11-michael.roth@amd.com> Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/include/asm/sev-common.h | 11 ++ arch/x86/kvm/svm/sev.c | 188 ++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 5 + 3 files changed, 204 insertions(+) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 6d68db812de1..8647cc05e2f4 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -129,8 +129,19 @@ enum psc_op { * The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which * is a local stack variable in set_pages_state(). Do not increase this value * without evaluating the impact to stack usage. + * + * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value + * is needed, such as when processing GHCB requests on the hypervisor side. */ #define VMGEXIT_PSC_MAX_ENTRY 64 +#define VMGEXIT_PSC_MAX_COUNT 253 + +#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32) +#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1) +#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2) + +#define VMGEXIT_PSC_OP_PRIVATE 1 +#define VMGEXIT_PSC_OP_SHARED 2 struct psc_hdr { u16 cur_entry; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 43f7b939f0bb..f999ec7e3c81 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3471,6 +3471,10 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_HV_FEATURES: case SVM_VMGEXIT_TERM_REQUEST: break; + case SVM_VMGEXIT_PSC: + if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) + goto vmgexit_err; + break; default: reason = GHCB_ERR_INVALID_EVENT; goto vmgexit_err; @@ -3707,6 +3711,183 @@ static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) return 0; /* forward request to userspace */ } +struct psc_buffer { + struct psc_hdr hdr; + struct psc_entry entries[]; +} __packed; + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); + +static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) +{ + svm->sev_es.psc_inflight = 0; + svm->sev_es.psc_idx = 0; + svm->sev_es.psc_2m = false; + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret); +} + +static void __snp_complete_one_psc(struct vcpu_svm *svm) +{ + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + struct psc_entry *entries = psc->entries; + struct psc_hdr *hdr = &psc->hdr; + __u16 idx; + + /* + * Everything in-flight has been processed successfully. Update the + * corresponding entries in the guest's PSC buffer and zero out the + * count of in-flight PSC entries. + */ + for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; + svm->sev_es.psc_inflight--, idx++) { + struct psc_entry *entry = &entries[idx]; + + entry->cur_page = entry->pagesize ? 512 : 1; + } + + hdr->cur_entry = idx; +} + +static int snp_complete_one_psc(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + + if (vcpu->run->hypercall.ret) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; /* resume guest */ + } + + __snp_complete_one_psc(svm); + + /* Handle the next range (if any). */ + return snp_begin_psc(svm, psc); +} + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) +{ + struct psc_entry *entries = psc->entries; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct psc_hdr *hdr = &psc->hdr; + struct psc_entry entry_start; + u16 idx, idx_start, idx_end; + int npages; + bool huge; + u64 gfn; + + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + +next_range: + /* There should be no other PSCs in-flight at this point. */ + if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * The PSC descriptor buffer can be modified by a misbehaved guest after + * validation, so take care to only use validated copies of values used + * for things like array indexing. + */ + idx_start = hdr->cur_entry; + idx_end = hdr->end_entry; + + if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); + return 1; + } + + /* Find the start of the next range which needs processing. */ + for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { + entry_start = entries[idx]; + + gfn = entry_start.gfn; + huge = entry_start.pagesize; + npages = huge ? 512 : 1; + + if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); + return 1; + } + + if (entry_start.cur_page) { + /* + * If this is a partially-completed 2M range, force 4K handling + * for the remaining pages since they're effectively split at + * this point. Subsequent code should ensure this doesn't get + * combined with adjacent PSC entries where 2M handling is still + * possible. + */ + npages -= entry_start.cur_page; + gfn += entry_start.cur_page; + huge = false; + } + + if (npages) + break; + } + + if (idx > idx_end) { + /* Nothing more to process. */ + snp_complete_psc(svm, 0); + return 1; + } + + svm->sev_es.psc_2m = huge; + svm->sev_es.psc_idx = idx; + svm->sev_es.psc_inflight = 1; + + /* + * Find all subsequent PSC entries that contain adjacent GPA + * ranges/operations and can be combined into a single + * KVM_HC_MAP_GPA_RANGE exit. + */ + while (++idx <= idx_end) { + struct psc_entry entry = entries[idx]; + + if (entry.operation != entry_start.operation || + entry.gfn != entry_start.gfn + npages || + entry.cur_page || !!entry.pagesize != huge) + break; + + svm->sev_es.psc_inflight++; + npages += huge ? 512 : 1; + } + + switch (entry_start.operation) { + case VMGEXIT_PSC_OP_PRIVATE: + case VMGEXIT_PSC_OP_SHARED: + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= entry_start.pagesize + ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M + : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + vcpu->arch.complete_userspace_io = snp_complete_one_psc; + return 0; /* forward request to userspace */ + default: + /* + * Only shared/private PSC operations are currently supported, so if the + * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), + * then consider the entire range completed and avoid exiting to + * userspace. In theory snp_complete_psc() can always be called directly + * at this point to complete the current range and start the next one, + * but that could lead to unexpected levels of recursion. + */ + __snp_complete_one_psc(svm); + goto next_range; + } + + unreachable(); +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3965,6 +4146,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) vcpu->run->system_event.ndata = 1; vcpu->run->system_event.data[0] = control->ghcb_gpa; break; + case SVM_VMGEXIT_PSC: + ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + if (ret) + break; + + ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 13020fda2ef1..73ce87974698 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -218,6 +218,11 @@ struct vcpu_sev_es_state { /* CSV2 reboot vmsa */ struct vmcb_save_area *reset_vmsa; #endif + /* SNP Page-State-Change buffer entries currently being processed */ + u16 psc_idx; + u16 psc_inflight; + bool psc_2m; + u64 ghcb_registered_gpa; }; -- Gitee From 787fe59ac6845bbb67f4a66de778414e53ad546a Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 03:52:01 -0500 Subject: [PATCH 39/75] KVM: SEV: Add support to handle RMP nested page faults ANBZ: #34453 commit c63cf135cc996fc88be95a9ba8b80e3cff8d275b upstream. When SEV-SNP is enabled in the guest, the hardware places restrictions on all memory accesses based on the contents of the RMP table. When hardware encounters RMP check failure caused by the guest memory access it raises the #NPF. The error code contains additional information on the access type. See the APM volume 2 for additional information. When using gmem, RMP faults resulting from mismatches between the state in the RMP table vs. what the guest expects via its page table result in KVM_EXIT_MEMORY_FAULTs being forwarded to userspace to handle. This means the only expected case that needs to be handled in the kernel is when the page size of the entry in the RMP table is larger than the mapping in the nested page table, in which case a PSMASH instruction needs to be issued to split the large RMP entry into individual 4K entries so that subsequent accesses can succeed. Signed-off-by: Brijesh Singh Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-12-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/sev.h | 3 + arch/x86/kvm/mmu.h | 2 - arch/x86/kvm/mmu/mmu.c | 1 + arch/x86/kvm/svm/sev.c | 109 ++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 14 ++-- arch/x86/kvm/svm/svm.h | 3 + arch/x86/kvm/trace.h | 31 +++++++++ arch/x86/kvm/x86.c | 1 + 9 files changed, 159 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4960e0c21b0c..f73a6c42b94f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1938,6 +1938,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index d33c1f1b9e75..9d24caccac88 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -90,6 +90,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* RMUPDATE detected 4K page and 2MB page overlap. */ #define RMPUPDATE_FAIL_OVERLAP 4 +/* PSMASH failed due to concurrent access by another CPU */ +#define PSMASH_FAIL_INUSE 3 + /* RMP page size */ #define RMP_PG_SIZE_4K 0 #define RMP_PG_SIZE_2M 1 diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 62ef27314c13..0169c8d25ade 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -246,8 +246,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); - int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d60d35eb4038..dcacb37a95d1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6746,6 +6746,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, return need_tlb_flush; } +EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f999ec7e3c81..e96451737719 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3669,6 +3669,23 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_rmptable_psmash(kvm_pfn_t pfn) +{ + int ret; + + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); + + /* + * PSMASH_FAIL_INUSE indicates another processor is modifying the + * entry, so retry until that's no longer the case. + */ + do { + ret = psmash(pfn); + } while (ret == PSMASH_FAIL_INUSE); + + return ret; +} + static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4428,3 +4445,95 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) return p; } + +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) +{ + struct kvm_memory_slot *slot; + struct kvm *kvm = vcpu->kvm; + int order, rmp_level, ret; + bool assigned; + kvm_pfn_t pfn; + gfn_t gfn; + + gfn = gpa >> PAGE_SHIFT; + + /* + * The only time RMP faults occur for shared pages is when the guest is + * triggering an RMP fault for an implicit page-state change from + * shared->private. Implicit page-state changes are forwarded to + * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults + * for shared pages should not end up here. + */ + if (!kvm_mem_is_private(kvm, gfn)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", + gpa); + return; + } + + slot = gfn_to_memslot(kvm, gfn); + if (!kvm_slot_can_be_private(slot)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", + gpa); + return; + } + + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order); + if (ret) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", + gpa); + return; + } + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret || !assigned) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", + gpa, pfn, ret); + goto out_no_trace; + } + + /* + * There are 2 cases where a PSMASH may be needed to resolve an #NPF + * with PFERR_GUEST_RMP_BIT set: + * + * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM + * bit set if the guest issues them with a smaller granularity than + * what is indicated by the page-size bit in the 2MB RMP entry for + * the PFN that backs the GPA. + * + * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is + * smaller than what is indicated by the 2MB RMP entry for the PFN + * that backs the GPA. + * + * In both these cases, the corresponding 2M RMP entry needs to + * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already + * split into 4K RMP entries, then this is likely a spurious case which + * can occur when there are concurrent accesses by the guest to a 2MB + * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in + * the process of being PMASH'd into 4K entries. These cases should + * resolve automatically on subsequent accesses, so just ignore them + * here. + */ + if (rmp_level == PG_LEVEL_4K) + goto out; + + ret = snp_rmptable_psmash(pfn); + if (ret) { + /* + * Look it up again. If it's 4K now then the PSMASH may have + * raced with another process and the issue has already resolved + * itself. + */ + if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && + assigned && rmp_level == PG_LEVEL_4K) + goto out; + + pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", + gpa, pfn, ret); + } + + kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); +out: + trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); +out_no_trace: + put_page(pfn_to_page(pfn)); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 416c68e96e22..5fe18683afb0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2096,6 +2096,7 @@ static int pf_interception(struct kvm_vcpu *vcpu) static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + int rc; u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; @@ -2113,10 +2114,15 @@ static int npf_interception(struct kvm_vcpu *vcpu) error_code |= PFERR_PRIVATE_ACCESS; trace_kvm_page_fault(vcpu, fault_address, error_code); - return kvm_mmu_page_fault(vcpu, fault_address, error_code, - static_cpu_has(X86_FEATURE_DECODEASSISTS) ? - svm->vmcb->control.insn_bytes : NULL, - svm->vmcb->control.insn_len); + rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, + svm->vmcb->control.insn_len); + + if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) + sev_handle_rmp_fault(vcpu, fault_address, error_code); + + return rc; } static int db_interception(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 73ce87974698..b8a83ebb8ecf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -739,6 +739,7 @@ void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); int sev_dev_get_attr(u32 group, u64 attr, u64 *val); extern unsigned int max_sev_asid; +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); #else static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) { return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); @@ -752,6 +753,8 @@ static inline void sev_hardware_unsetup(void) {} static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } #define max_sev_asid 0 +static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} + #endif /* vmenter.S */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 15144cd5545c..11b201ed54d0 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1934,6 +1934,37 @@ TRACE_EVENT(kvm_csv3_sp_remove, __entry->track_pfn, __entry->track_hva, __entry->order) ); +/* + * Tracepoint for #NPFs due to RMP faults. + */ +TRACE_EVENT(kvm_rmp_fault, + TP_PROTO(struct kvm_vcpu *vcpu, u64 gpa, u64 pfn, u64 error_code, + int rmp_level, int psmash_ret), + TP_ARGS(vcpu, gpa, pfn, error_code, rmp_level, psmash_ret), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, gpa) + __field(u64, pfn) + __field(u64, error_code) + __field(int, rmp_level) + __field(int, psmash_ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gpa = gpa; + __entry->pfn = pfn; + __entry->error_code = error_code; + __entry->rmp_level = rmp_level; + __entry->psmash_ret = psmash_ret; + ), + + TP_printk("vcpu %u gpa %016llx pfn 0x%llx error_code 0x%llx rmp_level %d psmash_ret %d", + __entry->vcpu_id, __entry->gpa, __entry->pfn, + __entry->error_code, __entry->rmp_level, __entry->psmash_ret) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ed769eef534f..4b73eb950f71 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13994,6 +13994,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_csv3_sp_insert_dup); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_csv3_sp_insert); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_csv3_sp_hit); -- Gitee From 3c2534880266aa67a008cd74327ca566d84be409 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 1 May 2024 03:52:02 -0500 Subject: [PATCH 40/75] KVM: SEV: Support SEV-SNP AP Creation NAE event ANBZ: #34453 commit e366f92ea99e1961fbad5e2110900e9f4fcb249b upstream. Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Hemanth Selam Signed-off-by: mohanasv --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/svm.h | 6 + arch/x86/kvm/svm/sev.c | 231 +++++++++++++++++++++++++++++++- arch/x86/kvm/svm/svm.c | 11 +- arch/x86/kvm/svm/svm.h | 9 ++ arch/x86/kvm/x86.c | 11 ++ 6 files changed, 266 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f73a6c42b94f..9062ff09b15f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -113,6 +113,7 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 9eeb26ff60ba..604cb618167a 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -286,8 +286,14 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) +#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) +#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define SVM_SEV_FEAT_INT_INJ_MODES \ + (SVM_SEV_FEAT_RESTRICTED_INJECTION | \ + SVM_SEV_FEAT_ALTERNATE_INJECTION) + struct vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e96451737719..b899b71c48eb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -41,7 +41,7 @@ #define GHCB_VERSION_DEFAULT 2ULL #define GHCB_VERSION_MIN 1ULL -#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP +#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) /* enable/disable SEV support */ static bool sev_enabled = true; @@ -3464,6 +3464,13 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; + case SVM_VMGEXIT_AP_CREATION: + if (!sev_snp_guest(vcpu->kvm)) + goto vmgexit_err; + if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) + if (!kvm_ghcb_rax_is_valid(svm)) + goto vmgexit_err; + break; case SVM_VMGEXIT_NMI_COMPLETE: case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: @@ -3905,6 +3912,205 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) unreachable(); } +static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex)); + + /* Mark the vCPU as offline and not runnable */ + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + + /* Clear use of the VMSA */ + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + + if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { + gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + struct kvm_memory_slot *slot; + kvm_pfn_t pfn; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) + return -EINVAL; + + /* + * The new VMSA will be private memory guest memory, so + * retrieve the PFN from the gmem backend. + */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL)) + return -EINVAL; + + /* + * From this point forward, the VMSA will always be a + * guest-mapped page rather than the initial one allocated + * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa + * could be free'd and cleaned up here, but that involves + * cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. + * Deferring that also allows the existing logic for SEV-ES + * VMSAs to be re-used with minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; + + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); + + /* Mark the vCPU as runnable */ + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + + /* + * gmem pages aren't currently migratable, but if this ever + * changes then care should be taken to ensure + * svm->sev_es.vmsa is pinned through some other means. + */ + kvm_release_pfn_clean(pfn); + } + + /* + * When replacing the VMSA during SEV-SNP AP creation, + * mark the VMCB dirty so that full state is always reloaded. + */ + vmcb_mark_all_dirty(svm->vmcb); + + return 0; +} + +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. + */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + if (!sev_snp_guest(vcpu->kvm)) + return; + + mutex_lock(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_waiting_for_reset) + goto unlock; + + svm->sev_es.snp_ap_waiting_for_reset = false; + + ret = __sev_snp_update_protected_guest_state(vcpu); + if (ret) + vcpu_unimpl(vcpu, "snp: AP state update on init failed\n"); + +unlock: + mutex_unlock(&svm->sev_es.snp_vmsa_mutex); +} + +static int sev_snp_ap_creation(struct vcpu_svm *svm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_vcpu *target_vcpu; + struct vcpu_svm *target_svm; + unsigned int request; + unsigned int apic_id; + bool kick; + int ret; + + request = lower_32_bits(svm->vmcb->control.exit_info_1); + apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); + + /* Validate the APIC ID */ + target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); + if (!target_vcpu) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", + apic_id); + return -EINVAL; + } + + ret = 0; + + target_svm = to_svm(target_vcpu); + + /* + * The target vCPU is valid, so the vCPU will be kicked unless the + * request is for CREATE_ON_INIT. For any errors at this stage, the + * kick will place the vCPU in an non-runnable state. + */ + kick = true; + + mutex_lock(&target_svm->sev_es.snp_vmsa_mutex); + + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + target_svm->sev_es.snp_ap_waiting_for_reset = true; + + /* Interrupt injection mode shouldn't change for AP creation */ + if (request < SVM_VMGEXIT_AP_DESTROY) { + u64 sev_features; + + sev_features = vcpu->arch.regs[VCPU_REGS_RAX]; + sev_features ^= sev->vmsa_features; + + if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX]); + ret = -EINVAL; + goto out; + } + } + + switch (request) { + case SVM_VMGEXIT_AP_CREATE_ON_INIT: + kick = false; + fallthrough; + case SVM_VMGEXIT_AP_CREATE: + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", + svm->vmcb->control.exit_info_2); + ret = -EINVAL; + goto out; + } + + /* + * Malicious guest can RMPADJUST a large page into VMSA which + * will hit the SNP erratum where the CPU will incorrectly signal + * an RMP violation #PF if a hugepage collides with the RMP entry + * of VMSA page, reject the AP CREATE request if VMSA address from + * guest is 2M aligned. + */ + if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { + vcpu_unimpl(vcpu, + "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", + svm->vmcb->control.exit_info_2); + ret = -EINVAL; + goto out; + } + + target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; + break; + case SVM_VMGEXIT_AP_DESTROY: + break; + default: + vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", + request); + ret = -EINVAL; + break; + } + +out: + if (kick) { + kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); + + if (target_vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) + kvm_make_request(KVM_REQ_UNBLOCK, target_vcpu); + + kvm_vcpu_kick(target_vcpu); + } + + mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex); + + return ret; +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -4170,6 +4376,15 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); break; + case SVM_VMGEXIT_AP_CREATION: + ret = sev_snp_ap_creation(svm); + if (ret) { + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + } + + ret = 1; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", @@ -4263,7 +4478,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later. */ - if (svm->sev_es.vmsa) + if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ @@ -4335,6 +4550,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); + + mutex_init(&svm->sev_es.snp_vmsa_mutex); } void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) @@ -4446,6 +4663,16 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) return p; } +void sev_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + if (!sev_snp_guest(vcpu->kvm)) + return; + + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu) && + vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +} + void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) { struct kvm_memory_slot *slot; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5fe18683afb0..72e3cb8549b2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1442,6 +1442,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; + if (init_event) + sev_snp_init_protected_guest_state(vcpu); + init_vmcb(vcpu); if (!init_event) @@ -5144,6 +5147,12 @@ static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) return page_address(page); } +static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + sev_vcpu_unblocking(vcpu); + avic_vcpu_unblocking(vcpu); +} + static struct kvm_x86_ops svm_x86_ops __initdata = { .name = KBUILD_MODNAME, @@ -5166,7 +5175,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_blocking = avic_vcpu_blocking, - .vcpu_unblocking = avic_vcpu_unblocking, + .vcpu_unblocking = svm_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index b8a83ebb8ecf..b34713c00546 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -224,6 +224,11 @@ struct vcpu_sev_es_state { bool psc_2m; u64 ghcb_registered_gpa; + + struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */ + gpa_t snp_vmsa_gpa; + bool snp_ap_waiting_for_reset; + bool snp_has_guest_vmsa; }; struct vcpu_svm { @@ -740,6 +745,8 @@ int sev_cpu_init(struct svm_cpu_data *sd); int sev_dev_get_attr(u32 group, u64 attr, u64 *val); extern unsigned int max_sev_asid; void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); +void sev_vcpu_unblocking(struct kvm_vcpu *vcpu); +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); #else static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) { return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); @@ -754,6 +761,8 @@ static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } #define max_sev_asid 0 static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} +static inline void sev_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b73eb950f71..8cfc77568e82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10883,6 +10883,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) { + kvm_vcpu_reset(vcpu, true); + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) { + r = 1; + goto out; + } + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || @@ -13149,6 +13157,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_PMI, vcpu)) return true; + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) return true; -- Gitee From 48794a94442fc8095ff349bb58bfc516ed60a79c Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 03:52:03 -0500 Subject: [PATCH 41/75] KVM: SEV: Implement gmem hook for initializing private pages ANBZ: #34453 commit 4f2e7aa1cfdf4374a4c876c9358e0d7035647eb1 upstream. This will handle the RMP table updates needed to put a page into a private state before mapping it into an SEV-SNP guest. Reviewed-by: Paolo Bonzini Signed-off-by: Michael Roth Message-ID: <20240501085210.2213060-14-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/svm/sev.c | 98 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 2 + arch/x86/kvm/svm/svm.h | 5 +++ arch/x86/kvm/x86.c | 5 +++ virt/kvm/guest_memfd.c | 4 +- 6 files changed, 113 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8c5d9abc0934..75267e8b9b4b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -129,6 +129,7 @@ config KVM_AMD_SEV depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM select KVM_GENERIC_PRIVATE_MEM + select HAVE_KVM_GMEM_PREPARE help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b899b71c48eb..f75af087d8a1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4764,3 +4764,101 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) out_no_trace: put_page(pfn_to_page(pfn)); } + +static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn = start; + + while (pfn < end) { + int ret, rmp_level; + bool assigned; + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret) { + pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", + pfn, start, end, rmp_level, ret); + return false; + } + + if (assigned) { + pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", + __func__, pfn, start, end, rmp_level); + return false; + } + + pfn++; + } + + return true; +} + +static u8 max_level_for_order(int order) +{ + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) +{ + kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + + /* + * If this is a large folio, and the entire 2M range containing the + * PFN is currently shared, then the entire 2M-aligned range can be + * set to private via a single 2M RMP entry. + */ + if (max_level_for_order(order) > PG_LEVEL_4K && + is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) + return true; + + return false; +} + +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + kvm_pfn_t pfn_aligned; + gfn_t gfn_aligned; + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc) { + pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", + gfn, pfn, rc); + return -ENOENT; + } + + if (assigned) { + pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", + __func__, gfn, pfn, max_order, level); + return 0; + } + + if (is_large_rmp_possible(kvm, pfn, max_order)) { + level = PG_LEVEL_2M; + pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); + } else { + level = PG_LEVEL_4K; + pfn_aligned = pfn; + gfn_aligned = gfn; + } + + rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); + if (rc) { + pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", + gfn, pfn, level, rc); + return -EINVAL; + } + + pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", + __func__, gfn, pfn, pfn_aligned, max_order, level); + + return 0; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 72e3cb8549b2..2243ba15be8b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5289,6 +5289,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .alloc_apic_backing_page = svm_alloc_apic_backing_page, .arch_hypercall = kvm_hygon_arch_hypercall, + + .gmem_prepare = sev_gmem_prepare, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index b34713c00546..7c99e7023cd6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -747,6 +747,7 @@ extern unsigned int max_sev_asid; void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); void sev_vcpu_unblocking(struct kvm_vcpu *vcpu); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); #else static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) { return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); @@ -763,6 +764,10 @@ static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXI static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} static inline void sev_vcpu_unblocking(struct kvm_vcpu *vcpu) {} static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} +static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + return 0; +} #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8cfc77568e82..8e2664fb7b3e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13608,6 +13608,11 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_arch_no_poll); #ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_SNP_VM; +} + int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) { return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 4c0cbb303e28..3d796d0ec755 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -39,8 +39,8 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol gfn = slot->base_gfn + index - slot->gmem.pgoff; rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page))); if (rc) { - pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx, error %d.\n", - index, rc); + pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", + index, gfn, pfn, rc); return rc; } } -- Gitee From e0175a935d56a19d9213160b3f08265a71d88215 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 03:52:04 -0500 Subject: [PATCH 42/75] KVM: SEV: Implement gmem hook for invalidating private pages ANBZ: #34453 commit 8eb01900b018b1bf20ce695758f30be71bd1b9ac upstream. Implement a platform hook to do the work of restoring the direct map entries of gmem-managed pages and transitioning the corresponding RMP table entries back to the default shared/hypervisor-owned state. Signed-off-by: Michael Roth Message-ID: <20240501085210.2213060-15-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/svm/sev.c | 64 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/svm/svm.h | 2 ++ 4 files changed, 68 insertions(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 75267e8b9b4b..ae5911168f27 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -130,6 +130,7 @@ config KVM_AMD_SEV select ARCH_HAS_CC_PLATFORM select KVM_GENERIC_PRIVATE_MEM select HAVE_KVM_GMEM_PREPARE + select HAVE_KVM_GMEM_INVALIDATE help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f75af087d8a1..37f3e240d440 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4862,3 +4862,67 @@ int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) return 0; } + +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn; + + pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); + + for (pfn = start; pfn < end;) { + bool use_2m_update = false; + int rc, rmp_level; + bool assigned; + + rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (WARN_ONCE(rc, "SEV: Failed to retrieve RMP entry for PFN 0x%llx error %d\n", + pfn, rc)) + goto next_pfn; + + if (!assigned) + goto next_pfn; + + use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && + end >= (pfn + PTRS_PER_PMD) && + rmp_level > PG_LEVEL_4K; + + /* + * If an unaligned PFN corresponds to a 2M region assigned as a + * large page in the RMP table, PSMASH the region into individual + * 4K RMP entries before attempting to convert a 4K sub-page. + */ + if (!use_2m_update && rmp_level > PG_LEVEL_4K) { + /* + * This shouldn't fail, but if it does, report it, but + * still try to update RMP entry to shared and pray this + * was a spurious error that can be addressed later. + */ + rc = snp_rmptable_psmash(pfn); + WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + } + + rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); + if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", + pfn, rc)) + goto next_pfn; + + /* + * SEV-ES avoids host/guest cache coherency issues through + * WBINVD hooks issued via MMU notifiers during run-time, and + * KVM's VM destroy path at shutdown. Those MMU notifier events + * don't cover gmem since there is no requirement to map pages + * to a HVA in order to use them for a running guest. While the + * shutdown path would still likely cover things for SNP guests, + * userspace may also free gmem pages during run-time via + * hole-punching operations on the guest_memfd, so flush the + * cache entries for these pages before free'ing them back to + * the host. + */ + clflush_cache_range(__va(pfn_to_hpa(pfn)), + use_2m_update ? PMD_SIZE : PAGE_SIZE); +next_pfn: + pfn += use_2m_update ? PTRS_PER_PMD : 1; + cond_resched(); + } +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2243ba15be8b..44fa4c1a09c9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5291,6 +5291,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .arch_hypercall = kvm_hygon_arch_hypercall, .gmem_prepare = sev_gmem_prepare, + .gmem_invalidate = sev_gmem_invalidate, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7c99e7023cd6..b1183e57488b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -748,6 +748,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); void sev_vcpu_unblocking(struct kvm_vcpu *vcpu); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #else static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) { return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); @@ -768,6 +769,7 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in { return 0; } +static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} #endif -- Gitee From 90f3ccdf50dee0a5b67c40abfa4fa01ecb86553f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 1 May 2024 03:52:05 -0500 Subject: [PATCH 43/75] KVM: x86: Implement hook for determining max NPT mapping level ANBZ: #34453 commit b2104024f40cadd7d357981c51c6437a41d86f63 upstream. In the case of SEV-SNP, whether or not a 2MB page can be mapped via a 2MB mapping in the guest's nested page table depends on whether or not any subpages within the range have already been initialized as private in the RMP table. The existing mixed-attribute tracking in KVM is insufficient here, for instance: - gmem allocates 2MB page - guest issues PVALIDATE on 2MB page - guest later converts a subpage to shared - SNP host code issues PSMASH to split 2MB RMP mapping to 4K - KVM MMU splits NPT mapping to 4K - guest later converts that shared page back to private At this point there are no mixed attributes, and KVM would normally allow for 2MB NPT mappings again, but this is actually not allowed because the RMP table mappings are 4K and cannot be promoted on the hypervisor side, so the NPT mappings must still be limited to 4K to match this. Implement a kvm_x86_ops.private_max_mapping_level() hook for SEV that checks for this condition and adjusts the mapping level accordingly. Reviewed-by: Paolo Bonzini Signed-off-by: Michael Roth Message-ID: <20240501085210.2213060-16-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 15 +++++++++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/svm/svm.h | 5 +++++ 3 files changed, 21 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 37f3e240d440..350b80e49303 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4926,3 +4926,18 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) cond_resched(); } } + +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc || !assigned) + return PG_LEVEL_4K; + + return level; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 44fa4c1a09c9..54e2db302d32 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5292,6 +5292,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .gmem_prepare = sev_gmem_prepare, .gmem_invalidate = sev_gmem_invalidate, + .private_max_mapping_level = sev_private_max_mapping_level, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index b1183e57488b..740883a9ee9b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -749,6 +749,7 @@ void sev_vcpu_unblocking(struct kvm_vcpu *vcpu); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); #else static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) { return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); @@ -770,6 +771,10 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in return 0; } static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} +static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return 0; +} #endif -- Gitee From 3018b0b609cf4dd2be957ac3df5a7afc67ec535b Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Wed, 1 May 2024 03:52:06 -0500 Subject: [PATCH 44/75] KVM: SEV: Avoid WBINVD for HVA-based MMU notifications for SNP ANBZ: #34453 commit ea262f8a7c360e71f3cb6c2151fd9bfcefd090e9 upstream. With SNP/guest_memfd, private/encrypted memory should not be mappable, and MMU notifications for HVA-mapped memory will only be relevant to unencrypted guest memory. Therefore, the rationale behind issuing a wbinvd_on_all_cpus() in sev_guest_memory_reclaimed() should not apply for SNP guests and can be ignored. Signed-off-by: Ashish Kalra Reviewed-by: Paolo Bonzini [mdr: Add some clarifications in commit] Signed-off-by: Michael Roth Message-ID: <20240501085210.2213060-17-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 350b80e49303..d03d7068c676 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3230,7 +3230,13 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) void sev_guest_memory_reclaimed(struct kvm *kvm) { - if (!sev_guest(kvm)) + /* + * With SNP+gmem, private/encrypted memory is unreachable via the + * hva-based mmu notifiers, so these events are only actually + * pertaining to shared pages where there is no need to perform + * the WBINVD to flush associated caches. + */ + if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; wbinvd_on_all_cpus(); -- Gitee From d137f3ced585fa6efb9410a1a8a7c9b307c5aa1c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 03:52:07 -0500 Subject: [PATCH 45/75] KVM: SVM: Add module parameter to enable SEV-SNP ANBZ: #34453 commit 6f627b425378915b6eda30908bedefc21b70b8c4 upstream. Add a module parameter than can be used to enable or disable the SEV-SNP feature. Now that KVM contains the support for the SNP set the GHCB hypervisor feature flag to indicate that SNP is supported. Signed-off-by: Brijesh Singh Reviewed-by: Paolo Bonzini Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-18-michael.roth@amd.com> Signed-off-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d03d7068c676..f3b262b606f3 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -52,7 +52,8 @@ static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); /* enable/disable SEV-SNP support */ -static bool sev_snp_enabled; +static bool sev_snp_enabled = true; +module_param_named(sev_snp, sev_snp_enabled, bool, 0444); /* enable/disable SEV-ES DebugSwap support */ static bool sev_es_debug_swap_enabled = true; -- Gitee From 1a7774fe1a5e565132f974b9d0d9d669d459a1d8 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 11 Apr 2023 15:44:46 +0930 Subject: [PATCH 46/75] KVM: PPC: Fix documentation for ppc mmu caps ANBZ: #34453 commit 651d61bc8b7d8bb622cfc24be2ee92eebb4ed3cc upstream. The documentation mentions KVM_CAP_PPC_RADIX_MMU, but the defines in the kvm headers spell it KVM_CAP_PPC_MMU_RADIX. Similarly with KVM_CAP_PPC_MMU_HASH_V3. Fixes: c92701322711 ("KVM: PPC: Book3S HV: Add userspace interfaces for POWER9 MMU") Signed-off-by: Joel Stanley Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://msgid.link/20230411061446.26324-1-joel@jms.id.au Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- Documentation/virt/kvm/api.rst | 8 ++++---- include/uapi/linux/kvm.h | 4 ++-- tools/include/uapi/linux/kvm.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9a9ce86f86f2..2356a9753384 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4312,7 +4312,7 @@ operating system that uses the PIT for timing (e.g. Linux 2.4.x). 4.100 KVM_PPC_CONFIGURE_V3_MMU ------------------------------ -:Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +:Capability: KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 :Architectures: ppc :Type: vm ioctl :Parameters: struct kvm_ppc_mmuv3_cfg (in) @@ -4346,7 +4346,7 @@ the Power ISA V3.00, Book III section 5.7.6.1. 4.101 KVM_PPC_GET_RMMU_INFO --------------------------- -:Capability: KVM_CAP_PPC_RADIX_MMU +:Capability: KVM_CAP_PPC_MMU_RADIX :Architectures: ppc :Type: vm ioctl :Parameters: struct kvm_ppc_rmmu_info (out) @@ -8082,7 +8082,7 @@ capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. -8.3 KVM_CAP_PPC_RADIX_MMU +8.3 KVM_CAP_PPC_MMU_RADIX ------------------------- :Architectures: ppc @@ -8092,7 +8092,7 @@ available, means that the kernel can support guests using the radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor). -8.4 KVM_CAP_PPC_HASH_MMU_V3 +8.4 KVM_CAP_PPC_MMU_HASH_V3 --------------------------- :Architectures: ppc diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index bb53a4d17e2a..2eb1939dad2b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1592,9 +1592,9 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) -/* Available with KVM_CAP_PPC_RADIX_MMU */ +/* Available with KVM_CAP_PPC_MMU_RADIX */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* Available with KVM_CAP_PPC_GET_CPU_CHAR */ #define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 29462d23938f..e86aab0e9fda 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1204,7 +1204,7 @@ struct kvm_vfio_spapr_tce { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) -- Gitee From 6e1baf1c4d4ed8cec1be0606238145e2743cd85d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 28 May 2024 16:24:15 -0300 Subject: [PATCH 47/75] tools headers UAPI: Sync kvm headers with the kernel sources ANBZ: #34453 commit 88e520512a68b4e724cb0f4281c79ae28673ccb1 upstream. To pick the changes in: 4af663c2f64a8d25 ("KVM: SEV: Allow per-guest configuration of GHCB protocol version") 4f5defae708992dd ("KVM: SEV: introduce KVM_SEV_INIT2 operation") 26c44aa9e076ed83 ("KVM: SEV: define VM types for SEV and SEV-ES") ac5c48027bacb1b5 ("KVM: SEV: publish supported VMSA features") 651d61bc8b7d8bb6 ("KVM: PPC: Fix documentation for ppc mmu caps") That don't change functionality in tools/perf, as no new ioctl is added for the 'perf trace' scripts to harvest. This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Joel Stanley Cc: Michael Ellerman Cc: Michael Roth Cc: Namhyung Kim Cc: Paolo Bonzini Link: https://lore.kernel.org/lkml/ZlYxAdHjyAkvGtMW@x1 Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Mukesh Ogare Signed-off-by: mohanasv --- tools/arch/x86/include/uapi/asm/kvm.h | 22 ++++++++++++++++++++-- tools/include/uapi/linux/kvm.h | 4 ++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index ef11aa4cab42..9fae1b73b529 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -457,8 +457,13 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 -/* attributes for system fd (group 0) */ -#define KVM_X86_XCOMP_GUEST_SUPP 0 +/* vendor-independent attributes for system fd (group 0) */ +#define KVM_X86_GRP_SYSTEM 0 +# define KVM_X86_XCOMP_GUEST_SUPP 0 + +/* vendor-specific groups and attributes for system fd */ +#define KVM_X86_GRP_SEV 1 +# define KVM_X86_SEV_VMSA_FEATURES 0 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -689,6 +694,9 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* Second time is the charm; improved versions of the above ioctls. */ + KVM_SEV_INIT2, + KVM_SEV_NR_MAX, }; @@ -700,6 +708,14 @@ struct kvm_sev_cmd { __u32 sev_fd; }; +struct kvm_sev_init { + __u64 vmsa_features; + __u32 flags; + __u16 ghcb_version; + __u16 pad1; + __u32 pad2[8]; +}; + struct kvm_sev_launch_start { __u32 handle; __u32 policy; @@ -856,5 +872,7 @@ struct kvm_hyperv_eventfd { #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +#define KVM_X86_SEV_VM 2 +#define KVM_X86_SEV_ES_VM 3 #endif /* _ASM_X86_KVM_H */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index e86aab0e9fda..3c8e90b0abab 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1204,9 +1204,9 @@ struct kvm_vfio_spapr_tce { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */ +/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) -/* Available with KVM_CAP_PPC_RADIX_MMU */ +/* Available with KVM_CAP_PPC_MMU_RADIX */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* Available with KVM_CAP_PPC_GET_CPU_CHAR */ #define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char) -- Gitee From 338b86a64feb6072d8a17fc63eec1eed1af1af64 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Tue, 28 May 2024 15:58:09 -0500 Subject: [PATCH 48/75] KVM: SEV: Automatically switch reclaimed pages to shared ANBZ: #34453 commit febff040b1a6781c46b3670e73fbcb707e731e7e upstream. Currently there's a consistent pattern of always calling host_rmp_make_shared() immediately after snp_page_reclaim(), so go ahead and handle it automatically as part of snp_page_reclaim(). Also rename it to kvm_rmp_make_shared() to more easily distinguish it as a KVM-specific variant of the more generic rmp_make_shared() helper. Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Gokul K Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 55 ++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f3b262b606f3..dc0006f5f60e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -331,6 +331,21 @@ static void sev_decommission(unsigned int handle) sev_guest_decommission(&decommission, NULL); } +/* + * Transition a page to hypervisor-owned/shared state in the RMP table. This + * should not fail under normal conditions, but leak the page should that + * happen since it will no longer be usable by the host due to RMP protections. + */ +static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) +{ + if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { + snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); + return -EIO; + } + + return 0; +} + /* * Certain page-states, such as Pre-Guest and Firmware pages (as documented * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be @@ -340,32 +355,25 @@ static void sev_decommission(unsigned int handle) * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they * might not be usable by the host due to being set as immutable or still * being associated with a guest ASID. + * + * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be + * converted back to shared, as the page is no longer usable due to RMP + * protections, and it's infeasible for the guest to continue on. */ -static int snp_page_reclaim(u64 pfn) +static int snp_page_reclaim(struct kvm *kvm, u64 pfn) { struct sev_data_snp_page_reclaim data = {0}; - int err, rc; + int fw_err, rc; data.paddr = __sme_set(pfn << PAGE_SHIFT); - rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); - if (WARN_ONCE(rc, "Failed to reclaim PFN %llx", pfn)) + rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); + if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { snp_leak_pages(pfn, 1); + return -EIO; + } - return rc; -} - -/* - * Transition a page to hypervisor-owned/shared state in the RMP table. This - * should not fail under normal conditions, but leak the page should that - * happen since it will no longer be usable by the host due to RMP protections. - */ -static int host_rmp_make_shared(u64 pfn, enum pg_level level) -{ - int rc; - - rc = rmp_make_shared(pfn, level); - if (WARN_ON_ONCE(rc)) - snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); + if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) + return -EIO; return rc; } @@ -2381,7 +2389,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf * information to provide information on which CPUID leaves/fields * failed CPUID validation. */ - if (!snp_page_reclaim(pfn + i) && !host_rmp_make_shared(pfn + i, PG_LEVEL_4K) && + if (!snp_page_reclaim(kvm, pfn + i) && sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { void *vaddr = kmap_local_pfn(pfn + i); @@ -2399,7 +2407,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", __func__, ret, sev_populate_args->fw_error, n_private); for (i = 0; i < n_private; i++) - host_rmp_make_shared(pfn + i, PG_LEVEL_4K); + kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); return ret; } @@ -2517,8 +2525,7 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, &data, &argp->error); if (ret) { - if (!snp_page_reclaim(pfn)) - host_rmp_make_shared(pfn, PG_LEVEL_4K); + snp_page_reclaim(kvm, pfn); return ret; } @@ -3260,7 +3267,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) if (sev_snp_guest(vcpu->kvm)) { u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; - if (host_rmp_make_shared(pfn, PG_LEVEL_4K)) + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) goto skip_vmsa_free; } -- Gitee From 1210d491269f46f0a8a30bc9c985fe9a5e2c95ed Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 3 Jun 2024 12:37:26 -0400 Subject: [PATCH 49/75] KVM: SEV: Don't WARN() if RMP lookup fails when invalidating gmem pages ANBZ: #34453 commit 73137f59246da530c29674a506f83a18fe327946 upstream. The hook only handles cleanup work specific to SNP, e.g. RMP table entries and flushing caches for encrypted guest memory. When run on a non-SNP-enabled host (currently only possible using KVM_X86_SW_PROTECTED_VM, e.g. via KVM selftests), the callback is a noop and will WARN due to the RMP table not being present. It's actually expected in this case that the RMP table wouldn't be present and that the hook should be a noop, so drop the WARN_ONCE(). Reported-by: Sean Christopherson Closes: https://lore.kernel.org/kvm/ZkU3_y0UoPk5yAeK@google.com/ Fixes: 8eb01900b018 ("KVM: SEV: Implement gmem hook for invalidating private pages") Suggested-by: Paolo Bonzini Signed-off-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Gokul K Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index dc0006f5f60e..2c675fb69bb5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4881,6 +4881,9 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) { kvm_pfn_t pfn; + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); for (pfn = start; pfn < end;) { @@ -4889,11 +4892,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) bool assigned; rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); - if (WARN_ONCE(rc, "SEV: Failed to retrieve RMP entry for PFN 0x%llx error %d\n", - pfn, rc)) - goto next_pfn; - - if (!assigned) + if (rc || !assigned) goto next_pfn; use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && -- Gitee From d75c2fa0e62dac4f098727de541696dae6325cba Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 22 May 2024 13:31:58 -0500 Subject: [PATCH 50/75] KVM: SVM: Remove the need to trigger an UNBLOCK event on AP creation ANBZ: #34453 commit b2ec042347fde5df1239d228b4d86ca642944870 upstream. All SNP APs are initially started using the APIC INIT/SIPI sequence in the guest. This sequence moves the AP MP state from KVM_MP_STATE_UNINITIALIZED to KVM_MP_STATE_RUNNABLE, so there is no need to attempt the UNBLOCK. As it is, the UNBLOCK support in SVM is only enabled when AVIC is enabled. When AVIC is disabled, AP creation is still successful. Remove the KVM_REQ_UNBLOCK request from the AP creation code and revert the changes to the vcpu_unblocking() kvm_x86_ops path. Signed-off-by: Tom Lendacky Signed-off-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Gokul K Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 14 -------------- arch/x86/kvm/svm/svm.c | 8 +------- arch/x86/kvm/svm/svm.h | 2 -- 3 files changed, 1 insertion(+), 23 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 2c675fb69bb5..49527c099a82 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4113,10 +4113,6 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) out: if (kick) { kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - - if (target_vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) - kvm_make_request(KVM_REQ_UNBLOCK, target_vcpu); - kvm_vcpu_kick(target_vcpu); } @@ -4677,16 +4673,6 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) return p; } -void sev_vcpu_unblocking(struct kvm_vcpu *vcpu) -{ - if (!sev_snp_guest(vcpu->kvm)) - return; - - if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu) && - vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -} - void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) { struct kvm_memory_slot *slot; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 54e2db302d32..2b0ea1ee3a1d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5147,12 +5147,6 @@ static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) return page_address(page); } -static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) -{ - sev_vcpu_unblocking(vcpu); - avic_vcpu_unblocking(vcpu); -} - static struct kvm_x86_ops svm_x86_ops __initdata = { .name = KBUILD_MODNAME, @@ -5175,7 +5169,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_blocking = avic_vcpu_blocking, - .vcpu_unblocking = svm_vcpu_unblocking, + .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 740883a9ee9b..20cbad253c9a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -745,7 +745,6 @@ int sev_cpu_init(struct svm_cpu_data *sd); int sev_dev_get_attr(u32 group, u64 attr, u64 *val); extern unsigned int max_sev_asid; void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); -void sev_vcpu_unblocking(struct kvm_vcpu *vcpu); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); @@ -764,7 +763,6 @@ static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } #define max_sev_asid 0 static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} -static inline void sev_vcpu_unblocking(struct kvm_vcpu *vcpu) {} static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) { -- Gitee From 81ad8ed52491bddf9c397855b4f30f0439b613bb Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Fri, 31 May 2024 04:46:42 +0000 Subject: [PATCH 51/75] KVM: SEV-ES: Prevent MSR access post VMSA encryption ANBZ: #34453 commit 27bd5fdc24c0d5d1306f968ef24105c4577242b0 upstream. KVM currently allows userspace to read/write MSRs even after the VMSA is encrypted. This can cause unintentional issues if MSR access has side- effects. For ex, while migrating a guest, userspace could attempt to migrate MSR_IA32_DEBUGCTLMSR and end up unintentionally disabling LBRV on the target. Fix this by preventing access to those MSRs which are context switched via the VMSA, once the VMSA is encrypted. Suggested-by: Sean Christopherson Signed-off-by: Nikunj A Dadhania Signed-off-by: Ravi Bangoria Message-ID: <20240531044644.768-2-ravi.bangoria@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Gokul K Signed-off-by: mohanasv --- arch/x86/kvm/svm/svm.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2b0ea1ee3a1d..1086155451d5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2897,10 +2897,24 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) return 0; } +static bool +sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + return sev_es_guest(vcpu->kvm) && + vcpu->arch.guest_state_protected && + svm_msrpm_offset(msr_info->index) != MSR_INVALID && + !msr_write_intercepted(vcpu, msr_info->index); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); + if (sev_es_prevent_msr_access(vcpu, msr_info)) { + msr_info->data = 0; + return -EINVAL; + } + switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && @@ -3063,6 +3077,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; + + if (sev_es_prevent_msr_access(vcpu, msr)) + return -EINVAL; + switch (ecx) { case MSR_AMD64_TSC_RATIO: -- Gitee From c0252a0a2ef83097f56a99b961e3e2ae9563a575 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Jun 2024 11:48:10 +0000 Subject: [PATCH 52/75] KVM: SNP: Fix LBR Virtualization for SNP guest ANBZ: #34453 commit f99b052256f16224687e5947772f0942bff73fc1 upstream. SEV-ES and thus SNP guest mandates LBR Virtualization to be _always_ ON. Although commit b7e4be0a224f ("KVM: SEV-ES: Delegate LBR virtualization to the processor") did the correct change for SEV-ES guests, it missed the SNP. Fix it. Reported-by: Srikanth Aithal Fixes: b7e4be0a224f ("KVM: SEV-ES: Delegate LBR virtualization to the processor") Signed-off-by: Ravi Bangoria Message-ID: <20240605114810.1304-1-ravi.bangoria@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Gokul K Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 49527c099a82..c4c8b83c9edb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2531,6 +2531,14 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) } svm->vcpu.arch.guest_state_protected = true; + /* + * SEV-ES (and thus SNP) guest mandates LBR Virtualization to + * be _always_ ON. Enable it only after setting + * guest_state_protected because KVM_SET_MSRS allows dynamic + * toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); } return 0; -- Gitee From 78a94310ad275c5089c692e126a8633958be5c31 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 May 2024 16:41:02 +0100 Subject: [PATCH 53/75] KVM: selftests: Fix shift of 32 bit unsigned int more than 32 bits ANBZ: #34453 commit d21b3c60d6e3917f7388db6bcc455f01c99ee42b upstream. Currrentl a 32 bit 1u value is being shifted more than 32 bits causing overflow and incorrect checking of bits 32-63. Fix this by using the BIT_ULL macro for shifting bits. Detected by cppcheck: sev_init2_tests.c:108:34: error: Shifting 32-bit value by 63 bits is undefined behaviour [shiftTooManyBits] Fixes: dfc083a181ba ("selftests: kvm: add tests for KVM_SEV_INIT2") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240523154102.2236133-1-colin.i.king@gmail.com Signed-off-by: Sean Christopherson Signed-off-by: Gokul K Signed-off-by: mohanasv --- tools/testing/selftests/kvm/x86_64/sev_init2_tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c index 7a4a61be119b..3fb967f40c6a 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c @@ -105,11 +105,11 @@ void test_features(uint32_t vm_type, uint64_t supported_features) int i; for (i = 0; i < 64; i++) { - if (!(supported_features & (1u << i))) + if (!(supported_features & BIT_ULL(i))) test_init2_invalid(vm_type, &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, "unknown feature"); - else if (KNOWN_FEATURES & (1u << i)) + else if (KNOWN_FEATURES & BIT_ULL(i)) test_init2(vm_type, &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); } -- Gitee From bd976b3486cba4d81c3c47bb79cbd371fd1056ec Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 4 Jun 2024 12:47:39 -0500 Subject: [PATCH 54/75] crypto: ccp - Fix null pointer dereference in __sev_snp_shutdown_locked ANBZ: #34453 commit 468e3295774d0edce15f4ae475913b5076dd4f40 upstream. Fix a null pointer dereference induced by DEBUG_TEST_DRIVER_REMOVE. Return from __sev_snp_shutdown_locked() if the psp_device or the sev_device structs are not initialized. Without the fix, the driver will produce the following splat: ccp 0000:55:00.5: enabling device (0000 -> 0002) ccp 0000:55:00.5: sev enabled ccp 0000:55:00.5: psp enabled BUG: kernel NULL pointer dereference, address: 00000000000000f0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC NOPTI CPU: 262 PID: 1 Comm: swapper/0 Not tainted 6.9.0-rc1+ #29 RIP: 0010:__sev_snp_shutdown_locked+0x2e/0x150 Code: 00 55 48 89 e5 41 57 41 56 41 54 53 48 83 ec 10 41 89 f7 49 89 fe 65 48 8b 04 25 28 00 00 00 48 89 45 d8 48 8b 05 6a 5a 7f 06 <4c> 8b a0 f0 00 00 00 41 0f b6 9c 24 a2 00 00 00 48 83 fb 02 0f 83 RSP: 0018:ffffb2ea4014b7b8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff9e4acd2e0a28 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffb2ea4014b808 RBP: ffffb2ea4014b7e8 R08: 0000000000000106 R09: 000000000003d9c0 R10: 0000000000000001 R11: ffffffffa39ff070 R12: ffff9e49d40590c8 R13: 0000000000000000 R14: ffffb2ea4014b808 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff9e58b1e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000f0 CR3: 0000000418a3e001 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: ? __die_body+0x6f/0xb0 ? __die+0xcc/0xf0 ? page_fault_oops+0x330/0x3a0 ? save_trace+0x2a5/0x360 ? do_user_addr_fault+0x583/0x630 ? exc_page_fault+0x81/0x120 ? asm_exc_page_fault+0x2b/0x30 ? __sev_snp_shutdown_locked+0x2e/0x150 __sev_firmware_shutdown+0x349/0x5b0 ? pm_runtime_barrier+0x66/0xe0 sev_dev_destroy+0x34/0xb0 psp_dev_destroy+0x27/0x60 sp_destroy+0x39/0x90 sp_pci_remove+0x22/0x60 pci_device_remove+0x4e/0x110 really_probe+0x271/0x4e0 __driver_probe_device+0x8f/0x160 driver_probe_device+0x24/0x120 __driver_attach+0xc7/0x280 ? driver_attach+0x30/0x30 bus_for_each_dev+0x10d/0x130 driver_attach+0x22/0x30 bus_add_driver+0x171/0x2b0 ? unaccepted_memory_init_kdump+0x20/0x20 driver_register+0x67/0x100 __pci_register_driver+0x83/0x90 sp_pci_init+0x22/0x30 sp_mod_init+0x13/0x30 do_one_initcall+0xb8/0x290 ? sched_clock_noinstr+0xd/0x10 ? local_clock_noinstr+0x3e/0x100 ? stack_depot_save_flags+0x21e/0x6a0 ? local_clock+0x1c/0x60 ? stack_depot_save_flags+0x21e/0x6a0 ? sched_clock_noinstr+0xd/0x10 ? local_clock_noinstr+0x3e/0x100 ? __lock_acquire+0xd90/0xe30 ? sched_clock_noinstr+0xd/0x10 ? local_clock_noinstr+0x3e/0x100 ? __create_object+0x66/0x100 ? local_clock+0x1c/0x60 ? __create_object+0x66/0x100 ? parameq+0x1b/0x90 ? parse_one+0x6d/0x1d0 ? parse_args+0xd7/0x1f0 ? do_initcall_level+0x180/0x180 do_initcall_level+0xb0/0x180 do_initcalls+0x60/0xa0 ? kernel_init+0x1f/0x1d0 do_basic_setup+0x41/0x50 kernel_init_freeable+0x1ac/0x230 ? rest_init+0x1f0/0x1f0 kernel_init+0x1f/0x1d0 ? rest_init+0x1f0/0x1f0 ret_from_fork+0x3d/0x50 ? rest_init+0x1f0/0x1f0 ret_from_fork_asm+0x11/0x20 Modules linked in: CR2: 00000000000000f0 ---[ end trace 0000000000000000 ]--- RIP: 0010:__sev_snp_shutdown_locked+0x2e/0x150 Code: 00 55 48 89 e5 41 57 41 56 41 54 53 48 83 ec 10 41 89 f7 49 89 fe 65 48 8b 04 25 28 00 00 00 48 89 45 d8 48 8b 05 6a 5a 7f 06 <4c> 8b a0 f0 00 00 00 41 0f b6 9c 24 a2 00 00 00 48 83 fb 02 0f 83 RSP: 0018:ffffb2ea4014b7b8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff9e4acd2e0a28 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffb2ea4014b808 RBP: ffffb2ea4014b7e8 R08: 0000000000000106 R09: 000000000003d9c0 R10: 0000000000000001 R11: ffffffffa39ff070 R12: ffff9e49d40590c8 R13: 0000000000000000 R14: ffffb2ea4014b808 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff9e58b1e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000f0 CR3: 0000000418a3e001 CR4: 0000000000770ef0 PKRU: 55555554 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x1fc00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Fixes: 1ca5614b84ee ("crypto: ccp: Add support to initialize the AMD-SP for SEV-SNP") Cc: stable@vger.kernel.org Signed-off-by: Kim Phillips Reviewed-by: Liam Merwick Reviewed-by: Mario Limonciello Reviewed-by: John Allen Reviewed-by: Tom Lendacky Signed-off-by: Herbert Xu Signed-off-by: Gokul K Signed-off-by: mohanasv --- drivers/crypto/ccp/sev-dev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 8e7024807dc6..3246bed16918 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1707,10 +1707,16 @@ static int sev_update_firmware(struct device *dev) static int __sev_snp_shutdown_locked(int *error, bool panic) { - struct sev_device *sev = psp_master->sev_data; + struct psp_device *psp = psp_master; + struct sev_device *sev; struct sev_data_snp_shutdown_ex data; int ret; + if (!psp || !psp->sev_data) + return 0; + + sev = psp->sev_data; + if (!sev->snp_initialized) return 0; -- Gitee From 899d810f9404a4e03eb7a9f4c88e2ec921d4fd75 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 11 Jun 2024 04:22:18 -0400 Subject: [PATCH 55/75] virt: guest_memfd: fix reference leak on hwpoisoned page ANBZ: #34453 commit c31745d2c508796a0996c88bf2e55f552d513f65 upstream. If kvm_gmem_get_pfn() detects an hwpoisoned page, it returns -EHWPOISON but it does not put back the reference that kvm_gmem_get_folio() had grabbed. Add the forgotten folio_put(). [Backport Changes] 1. In virt/kvm/guest_memfd.c, within __kvm_gmem_get_pfn(), the if block (folio_test_hwpoison(folio)) was updated based on latest upstream merge tag 02b0d3b9d4d. Where, the folio is unlocked and put, followed by a direct return of -EHWPOISON. Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Cc: stable@vger.kernel.org Reviewed-by: Liam Merwick Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 3d796d0ec755..785dd00511e6 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -571,8 +571,9 @@ static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, return PTR_ERR(folio); if (folio_test_hwpoison(folio)) { - r = -EHWPOISON; - goto out_unlock; + folio_unlock(folio); + folio_put(folio); + return -EHWPOISON; } page = folio_file_page(folio, index); @@ -583,7 +584,6 @@ static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, r = 0; -out_unlock: folio_unlock(folio); return r; -- Gitee From e4f811c118436305de96a1c3db6d9a2d7201ea9b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 7 Jun 2024 18:10:29 +0200 Subject: [PATCH 56/75] KVM: interrupt kvm_gmem_populate() on signals ANBZ: #34453 commit d81473840ce12fe682c19af9b4b5b39a9fe5a4c9 upstream. kvm_gmem_populate() is a potentially lengthy operation that can involve multiple calls to the firmware. Interrupt it if a signal arrives. Fixes: 1f6c06b177513 ("KVM: guest_memfd: Add interface for populating gmem pages with user data") Cc: Isaku Yamahata Cc: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 785dd00511e6..f2746c29048d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -633,6 +633,11 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long gfn_t gfn = start_gfn + i; kvm_pfn_t pfn; + if (signal_pending(current)) { + ret = -EINTR; + break; + } + ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); if (ret) break; -- Gitee From 97c66ad654e73f388a890b57f5ebfec0eff53c59 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Tue, 4 Jun 2024 18:35:10 -0500 Subject: [PATCH 57/75] KVM: SEV-ES: Fix svm_get_msr()/svm_set_msr() for KVM_SEV_ES_INIT guests ANBZ: #34453 commit cf6d9d2d243f242f51ee0666ca88e61d9408752f upstream. With commit 27bd5fdc24c0 ("KVM: SEV-ES: Prevent MSR access post VMSA encryption"), older VMMs like QEMU 9.0 and older will fail when booting SEV-ES guests with something like the following error: qemu-system-x86_64: error: failed to get MSR 0x174 qemu-system-x86_64: ../qemu.git/target/i386/kvm/kvm.c:3950: kvm_get_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed. This is because older VMMs that might still call svm_get_msr()/svm_set_msr() for SEV-ES guests after guest boot even if those interfaces were essentially just noops because of the vCPU state being encrypted and stored separately in the VMSA. Now those VMMs will get an -EINVAL and generally crash. Newer VMMs that are aware of KVM_SEV_INIT2 however are already aware of the stricter limitations of what vCPU state can be sync'd during guest run-time, so newer QEMU for instance will work both for legacy KVM_SEV_ES_INIT interface as well as KVM_SEV_INIT2. So when using KVM_SEV_INIT2 it's okay to assume userspace can deal with -EINVAL, whereas for legacy KVM_SEV_ES_INIT the kernel might be dealing with either an older VMM and so it needs to assume that returning -EINVAL might break the VMM. Address this by only returning -EINVAL if the guest was started with KVM_SEV_INIT2. Otherwise, just silently return. Cc: Ravi Bangoria Cc: Nikunj A Dadhania Reported-by: Srikanth Aithal Closes: https://lore.kernel.org/lkml/37usuu4yu4ok7be2hqexhmcyopluuiqj3k266z4gajc2rcj4yo@eujb23qc3zcm/ Fixes: 27bd5fdc24c0 ("KVM: SEV-ES: Prevent MSR access post VMSA encryption") Signed-off-by: Michael Roth Message-ID: <20240604233510.764949-1-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1086155451d5..fdeb4aa42a34 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2912,7 +2912,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (sev_es_prevent_msr_access(vcpu, msr_info)) { msr_info->data = 0; - return -EINVAL; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; } switch (msr_info->index) { @@ -3079,7 +3079,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; if (sev_es_prevent_msr_access(vcpu, msr)) - return -EINVAL; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; switch (ecx) { case MSR_AMD64_TSC_RATIO: -- Gitee From 3ff83d740c4a01afc4ff9064eda6e2e1d6ef5849 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 21 Jun 2024 10:42:05 -0500 Subject: [PATCH 58/75] x86/sev: Do RMP memory coverage check after max_pfn has been set ANBZ: #34453 commit 0440feb090790c6243bca85d6a794824e71ff26c upstream. The RMP table is probed early in the boot process before max_pfn has been set, so the logic to check if the RMP covers all of system memory is not valid. Move the RMP memory coverage check from snp_probe_rmptable_info() into snp_rmptable_init(), which is well after max_pfn has been set. Also, fix the calculation to use PFN_UP instead of PHYS_PFN, in order to compute the required RMP size properly. Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/bec4364c7e34358cc576f01bb197a7796a109169.1718984524.git.thomas.lendacky@amd.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/virt/svm/sev.c | 44 ++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 0ae10535c699..0ce17766c0e5 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -120,7 +120,7 @@ static __init void snp_enable(void *arg) bool snp_probe_rmptable_info(void) { - u64 max_rmp_pfn, calc_rmp_sz, rmp_sz, rmp_base, rmp_end; + u64 rmp_sz, rmp_base, rmp_end; rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); rdmsrl(MSR_AMD64_RMP_END, rmp_end); @@ -137,28 +137,11 @@ bool snp_probe_rmptable_info(void) rmp_sz = rmp_end - rmp_base + 1; - /* - * Calculate the amount the memory that must be reserved by the BIOS to - * address the whole RAM, including the bookkeeping area. The RMP itself - * must also be covered. - */ - max_rmp_pfn = max_pfn; - if (PHYS_PFN(rmp_end) > max_pfn) - max_rmp_pfn = PHYS_PFN(rmp_end); - - calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; - - if (calc_rmp_sz > rmp_sz) { - pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", - calc_rmp_sz, rmp_sz); - return false; - } - probed_rmp_base = rmp_base; probed_rmp_size = rmp_sz; pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n", - probed_rmp_base, probed_rmp_base + probed_rmp_size - 1); + rmp_base, rmp_end); return true; } @@ -206,9 +189,8 @@ void __init snp_fixup_e820_tables(void) */ static int __init snp_rmptable_init(void) { + u64 max_rmp_pfn, calc_rmp_sz, rmptable_size, rmp_end, val; void *rmptable_start; - u64 rmptable_size; - u64 val; if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; @@ -219,10 +201,28 @@ static int __init snp_rmptable_init(void) if (!probed_rmp_size) goto nosnp; + rmp_end = probed_rmp_base + probed_rmp_size - 1; + + /* + * Calculate the amount the memory that must be reserved by the BIOS to + * address the whole RAM, including the bookkeeping area. The RMP itself + * must also be covered. + */ + max_rmp_pfn = max_pfn; + if (PFN_UP(rmp_end) > max_pfn) + max_rmp_pfn = PFN_UP(rmp_end); + + calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; + if (calc_rmp_sz > probed_rmp_size) { + pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", + calc_rmp_sz, probed_rmp_size); + goto nosnp; + } + rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB); if (!rmptable_start) { pr_err("Failed to map RMP table\n"); - return 1; + goto nosnp; } /* -- Gitee From b03cdb0ce10ddf687a43c4ddaa1f2517ae9b2a2f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 13:56:54 -0400 Subject: [PATCH 59/75] mm, virt: merge AS_UNMOVABLE and AS_INACCESSIBLE ANBZ: #34453 commit 27e6a24a4cf3d25421c0f6ebb7c39f45fc14d20f upstream. The flags AS_UNMOVABLE and AS_INACCESSIBLE were both added just for guest_memfd; AS_UNMOVABLE is already in existing versions of Linux, while AS_INACCESSIBLE was acked for inclusion in 6.11. But really, they are the same thing: only guest_memfd uses them, at least for now, and guest_memfd pages are unmovable because they should not be accessed by the CPU. So merge them into one; use the AS_INACCESSIBLE name which is more comprehensive. At the same time, this fixes an embarrassing bug where AS_INACCESSIBLE was used as a bit mask, despite it being just a bit index. The bug was mostly benign, because AS_INACCESSIBLE's bit representation (1010) corresponded to setting AS_UNEVICTABLE (which is already set) and AS_ENOSPC (except no async writes can happen on the guest_memfd). So the AS_INACCESSIBLE flag simply had no effect. [Backport Changes] 1. In file include/linux/pagemap.h, the `mapping_flags` enum in the Anolis source code was already updated to match the latest upstream changes (from kernel 6.12-rc1 and above) in Anolis commit aab3f3062fc406. therefore , Adding the AS_INACCESSIBLE in place of AS_UNMOVABLE flag at position 8. To maintain consistency with the existing source code and avoid future conflicts. Fixes: 1d23040caa8b ("KVM: guest_memfd: Use AS_INACCESSIBLE when creating guest_memfd inode") Fixes: c72ceafbd12c ("mm: Introduce AS_INACCESSIBLE for encrypted/confidential memory") Cc: linux-mm@kvack.org Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Tested-by: Michael Roth Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- include/linux/pagemap.h | 14 +++++++------- mm/compaction.c | 12 ++++++------ mm/migrate.c | 2 +- mm/truncate.c | 2 +- virt/kvm/guest_memfd.c | 3 +-- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3ca97366bfc6..66d9b2be14e4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -214,9 +214,9 @@ enum mapping_flags { AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ - AS_UNMOVABLE = 8, /* The mapping cannot be moved, ever */ + AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping, + including to move the mapping */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, - AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping */ AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, @@ -330,20 +330,20 @@ static inline void mapping_clear_stable_writes(struct address_space *mapping) clear_bit(AS_STABLE_WRITES, &mapping->flags); } -static inline void mapping_set_unmovable(struct address_space *mapping) +static inline void mapping_set_inaccessible(struct address_space *mapping) { /* - * It's expected unmovable mappings are also unevictable. Compaction + * It's expected inaccessible mappings are also unevictable. Compaction * migrate scanner (isolate_migratepages_block()) relies on this to * reduce page locking. */ set_bit(AS_UNEVICTABLE, &mapping->flags); - set_bit(AS_UNMOVABLE, &mapping->flags); + set_bit(AS_INACCESSIBLE, &mapping->flags); } -static inline bool mapping_unmovable(struct address_space *mapping) +static inline bool mapping_inaccessible(struct address_space *mapping) { - return test_bit(AS_UNMOVABLE, &mapping->flags); + return test_bit(AS_INACCESSIBLE, &mapping->flags); } static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_space *mapping) diff --git a/mm/compaction.c b/mm/compaction.c index d452a383307c..860f7bb09922 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1174,22 +1174,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) || (mapping && is_unevictable)) { bool migrate_dirty = true; - bool is_unmovable; + bool is_inaccessible; /* * Only folios without mappings or that have * a ->migrate_folio callback are possible to migrate * without blocking. * - * Folios from unmovable mappings are not migratable. + * Folios from inaccessible mappings are not migratable. * * However, we can be racing with truncation, which can * free the mapping that we need to check. Truncation * holds the folio lock until after the folio is removed * from the page so holding it ourselves is sufficient. * - * To avoid locking the folio just to check unmovable, - * assume every unmovable folio is also unevictable, + * To avoid locking the folio just to check inaccessible, + * assume every inaccessible folio is also unevictable, * which is a cheaper test. If our assumption goes * wrong, it's not a correctness bug, just potentially * wasted cycles. @@ -1202,9 +1202,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, migrate_dirty = !mapping || mapping->a_ops->migrate_folio; } - is_unmovable = mapping && mapping_unmovable(mapping); + is_inaccessible = mapping && mapping_inaccessible(mapping); folio_unlock(folio); - if (!migrate_dirty || is_unmovable) + if (!migrate_dirty || is_inaccessible) goto isolate_fail_put; } diff --git a/mm/migrate.c b/mm/migrate.c index bc24b8a7cc4a..e91e7fea1a96 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1008,7 +1008,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, if (!mapping) rc = migrate_folio(mapping, dst, src, mode); - else if (mapping_unmovable(mapping)) + else if (mapping_inaccessible(mapping)) rc = -EOPNOTSUPP; else if (mapping->a_ops->migrate_folio) /* diff --git a/mm/truncate.c b/mm/truncate.c index 3f2451649fa0..d3ea61ac81b5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -259,7 +259,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) * doing a complex calculation here, and then doing the zeroing * anyway if the page split fails. */ - if (!(folio->mapping->flags & AS_INACCESSIBLE)) + if (!mapping_inaccessible(folio->mapping)) folio_zero_range(folio, offset, length); if (folio_needs_release(folio)) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index f2746c29048d..ef14e87a328b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -420,11 +420,10 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; - inode->i_mapping->flags |= AS_INACCESSIBLE; inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); - mapping_set_unmovable(inode->i_mapping); + mapping_set_inaccessible(inode->i_mapping); /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); -- Gitee From dd85df543a924731f24af7e27dfd0e1e4acf4b6e Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 10 Apr 2024 15:07:27 -0700 Subject: [PATCH 60/75] KVM: Document KVM_PRE_FAULT_MEMORY ioctl ANBZ: #34453 commit 9aed7a6c0b591801177c90137df43f244e2af9bb upstream. Adds documentation of KVM_PRE_FAULT_MEMORY ioctl. [1] It populates guest memory. It doesn't do extra operations on the underlying technology-specific initialization [2]. For example, CoCo-related operations won't be performed. Concretely for TDX, this API won't invoke TDH.MEM.PAGE.ADD() or TDH.MR.EXTEND(). Vendor-specific APIs are required for such operations. The key point is to adapt of vcpu ioctl instead of VM ioctl. First, populating guest memory requires vcpu. If it is VM ioctl, we need to pick one vcpu somehow. Secondly, vcpu ioctl allows each vcpu to invoke this ioctl in parallel. It helps to scale regarding guest memory size, e.g., hundreds of GB. [1] https://lore.kernel.org/kvm/Zbrj5WKVgMsUFDtb@google.com/ [2] https://lore.kernel.org/kvm/Ze-TJh0BBOWm9spT@google.com/ Suggested-by: Sean Christopherson Signed-off-by: Isaku Yamahata Message-ID: <9a060293c9ad9a78f1d8994cfe1311e818e99257.1712785629.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- Documentation/virt/kvm/api.rst | 55 ++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 2356a9753384..906a77d4b4af 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6317,6 +6317,61 @@ a single guest_memfd file, but the bound ranges must not overlap). See KVM_SET_USER_MEMORY_REGION2 for additional details. +4.143 KVM_PRE_FAULT_MEMORY +------------------------ + +:Capability: KVM_CAP_PRE_FAULT_MEMORY +:Architectures: none +:Type: vcpu ioctl +:Parameters: struct kvm_pre_fault_memory (in/out) +:Returns: 0 if at least one page is processed, < 0 on error + +Errors: + + ========== =============================================================== + EINVAL The specified `gpa` and `size` were invalid (e.g. not + page aligned, causes an overflow, or size is zero). + ENOENT The specified `gpa` is outside defined memslots. + EINTR An unmasked signal is pending and no page was processed. + EFAULT The parameter address was invalid. + EOPNOTSUPP Mapping memory for a GPA is unsupported by the + hypervisor, and/or for the current vCPU state/mode. + EIO unexpected error conditions (also causes a WARN) + ========== =============================================================== + +:: + + struct kvm_pre_fault_memory { + /* in/out */ + __u64 gpa; + __u64 size; + /* in */ + __u64 flags; + __u64 padding[5]; + }; + +KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory +for the current vCPU state. KVM maps memory as if the vCPU generated a +stage-2 read page fault, e.g. faults in memory as needed, but doesn't break +CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed. + +In some cases, multiple vCPUs might share the page tables. In this +case, the ioctl can be called in parallel. + +When the ioctl returns, the input values are updated to point to the +remaining range. If `size` > 0 on return, the caller can just issue +the ioctl again with the same `struct kvm_map_memory` argument. + +Shadow page tables cannot support this ioctl because they +are indexed by virtual address or nested guest physical address. +Calling this ioctl when the guest is using shadow page tables (for +example because it is running a nested guest with nested page tables) +will fail with `EOPNOTSUPP` even if `KVM_CHECK_EXTENSION` reports +the capability to be present. + +`flags` must currently be zero. + + 5. The kvm_run structure ======================== -- Gitee From 1ae780a6f1e26b3852906fb848e630eb7391d72d Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 10 Apr 2024 15:07:28 -0700 Subject: [PATCH 61/75] KVM: Add KVM_PRE_FAULT_MEMORY vcpu ioctl to pre-populate guest memory ANBZ: #34453 commit bc1a5cd002116552db4c3541e91f8a5b1b0cf65d upstream. Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the memory range and calls the arch-specific function. The implementation is optional and enabled by a Kconfig symbol. Suggested-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Rick Edgecombe Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- include/linux/kvm_host.h | 5 ++++ include/uapi/linux/kvm.h | 10 +++++++ virt/kvm/Kconfig | 3 ++ virt/kvm/kvm_main.c | 60 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 85858242bd0d..3c998c5def59 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2518,4 +2518,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range); +#endif + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2eb1939dad2b..affdd59749dc 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1233,6 +1233,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_PRE_FAULT_MEMORY 236 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 #define KVM_CAP_RISCV_MP_STATE_RESET 242 @@ -2225,4 +2226,13 @@ struct kvm_create_guest_memfd { __u64 reserved[6]; }; +#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) + +struct kvm_pre_fault_memory { + __u64 gpa; + __u64 size; + __u64 flags; + __u64 padding[5]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 0521e0782c5a..a06a766b65b4 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -68,6 +68,9 @@ config HAVE_KVM_INVALID_WAKEUPS config KVM_GENERIC_DIRTYLOG_READ_PROTECT bool +config KVM_GENERIC_PRE_FAULT_MEMORY + bool + config KVM_COMPAT def_bool y depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bcd8b7639f21..0e2f3baa5eb0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4407,6 +4407,52 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) return fd; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + int idx; + long r; + u64 full_size; + + if (range->flags) + return -EINVAL; + + if (!PAGE_ALIGNED(range->gpa) || + !PAGE_ALIGNED(range->size) || + range->gpa + range->size <= range->gpa) + return -EINVAL; + + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + full_size = range->size; + do { + if (signal_pending(current)) { + r = -EINTR; + break; + } + + r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); + if (WARN_ON_ONCE(r == 0 || r == -EIO)) + break; + + if (r < 0) + break; + + range->size -= r; + range->gpa += r; + cond_resched(); + } while (range->size); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu_put(vcpu); + + /* Return success if at least one page was mapped successfully. */ + return full_size == range->size ? r : 0; +} +#endif + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4611,6 +4657,20 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_get_stats_fd(vcpu); break; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY + case KVM_PRE_FAULT_MEMORY: { + struct kvm_pre_fault_memory range; + + r = -EFAULT; + if (copy_from_user(&range, argp, sizeof(range))) + break; + r = kvm_vcpu_pre_fault_memory(vcpu, &range); + /* Pass back leftover range. */ + if (copy_to_user(argp, &range, sizeof(range))) + r = -EFAULT; + break; + } +#endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } -- Gitee From e7a637a04dbb5624adfb6f2fb3d85782c29f9bd6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Jun 2024 12:59:06 -0700 Subject: [PATCH 62/75] KVM: x86/mmu: Account pf_{fixed,emulate,spurious} in callers of "do page fault" ANBZ: #34453 commit f5e7f00cf1950c771987778444a6600b5f2d883a upstream. Move the accounting of the result of kvm_mmu_do_page_fault() to its callers, as only pf_fixed is common to guest page faults and async #PFs, and upcoming support KVM_PRE_FAULT_MEMORY won't bump _any_ stats. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/mmu/mmu.c | 19 ++++++++++++++++++- arch/x86/kvm/mmu/mmu_internal.h | 13 ------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index dcacb37a95d1..1e1c9a2c0433 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4299,7 +4299,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); + r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, + true, NULL); + + /* + * Account fixed page faults, otherwise they'll never be counted, but + * ignore stats for all other return times. Page-ready "faults" aren't + * truly spurious and never trigger emulation + */ + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; } static inline u8 kvm_max_level_for_order(int order) @@ -5873,6 +5882,14 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r < 0) return r; + + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; + else if (r == RET_PF_EMULATE) + vcpu->stat.pf_emulate++; + else if (r == RET_PF_SPURIOUS) + vcpu->stat.pf_spurious++; + if (r != RET_PF_EMULATE) return 1; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 592315bfdf5d..bb23d8a0184f 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -323,19 +323,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; - /* - * Similar to above, prefetch faults aren't truly spurious, and the - * async #PF path doesn't do emulation. Do count faults that are fixed - * by the async #PF handler though, otherwise they'll never be counted. - */ - if (r == RET_PF_FIXED) - vcpu->stat.pf_fixed++; - else if (prefetch) - ; - else if (r == RET_PF_EMULATE) - vcpu->stat.pf_emulate++; - else if (r == RET_PF_SPURIOUS) - vcpu->stat.pf_spurious++; return r; } -- Gitee From d798bc04427928cc2617be0679a14c1d09fcfab4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 10 Jul 2024 10:27:10 -0400 Subject: [PATCH 63/75] KVM: x86/mmu: Make kvm_mmu_do_page_fault() return mapped level ANBZ: #34453 commit 58ef24699bcddfe3de0963c8f74ccf641ffe87f0 upstream. The guest memory population logic will need to know what page size or level (4K, 2M, ...) is mapped. Signed-off-by: Isaku Yamahata Message-ID: Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/mmu_internal.h | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1e1c9a2c0433..761d2099b9c0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4300,7 +4300,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) return; r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, - true, NULL); + true, NULL, NULL); /* * Account fixed page faults, otherwise they'll never be counted, but @@ -5875,7 +5875,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, - &emulation_type); + &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bb23d8a0184f..0ead070b9cc8 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -280,7 +280,8 @@ enum { }; static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u64 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, + int *emulation_type, u8 *level) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -322,6 +323,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + if (level) + *level = fault.goal_level; return r; } -- Gitee From 2d88b7cb73e45e663ef7b4f724814c4228f66a09 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 11 Jun 2024 03:57:58 -0400 Subject: [PATCH 64/75] KVM: x86: Implement kvm_arch_vcpu_pre_fault_memory() ANBZ: #34453 commit 6e01b7601dfed61bcccd8c386c5084fc0d53d20b upstream. Wire KVM_PRE_FAULT_MEMORY ioctl to kvm_mmu_do_page_fault() to populate guest memory. It can be called right after KVM_CREATE_VCPU creates a vCPU, since at that point kvm_mmu_create() and kvm_init_mmu() are called and the vCPU is ready to invoke the KVM page fault handler. The helper function kvm_tdp_map_page() takes care of the logic to process RET_PF_* return values and convert them to success or errno. Signed-off-by: Isaku Yamahata Message-ID: <9b866a0ae7147f96571c439e75429a03dcb659b6.1712785629.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/mmu/mmu.c | 73 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 3 ++ 3 files changed, 77 insertions(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ae5911168f27..d682a04ce279 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -49,6 +49,7 @@ config KVM select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_PRE_FAULT_MEMORY help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 761d2099b9c0..13ea28dababd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4657,6 +4657,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } +static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, + u8 *level) +{ + int r; + + /* + * Restrict to TDP page fault, since that's the only case where the MMU + * is indexed by GPA. + */ + if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) + return -EOPNOTSUPP; + + do { + if (signal_pending(current)) + return -EINTR; + cond_resched(); + r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); + } while (r == RET_PF_RETRY); + + if (r < 0) + return r; + + switch (r) { + case RET_PF_FIXED: + case RET_PF_SPURIOUS: + return 0; + + case RET_PF_EMULATE: + return -ENOENT; + + case RET_PF_RETRY: + case RET_PF_CONTINUE: + case RET_PF_INVALID: + default: + WARN_ONCE(1, "could not fix page fault during prefault"); + return -EIO; + } +} + +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK; + u8 level = PG_LEVEL_4K; + u64 end; + int r; + + /* + * reload is efficient when called repeatedly, so we can do it on + * every iteration. + */ + kvm_mmu_reload(vcpu); + + if (kvm_arch_has_private_mem(vcpu->kvm) && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + + /* + * Shadow paging uses GVA for kvm page fault, so restrict to + * two-dimensional paging. + */ + r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); + if (r < 0) + return r; + + /* + * If the mapping that covers range->gpa can use a huge page, it + * may start below it or end after range->gpa + range->size. + */ + end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); + return min(range->size, end - range->gpa); +} + static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e2664fb7b3e..8c9aae217a4b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4568,6 +4568,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MEMORY_FAULT_INFO: r = 1; break; + case KVM_CAP_PRE_FAULT_MEMORY: + r = tdp_enabled; + break; case KVM_CAP_EXIT_HYPERCALL: r = KVM_EXIT_HYPERCALL_VALID_MASK; break; -- Gitee From f5ac25eae6cddb05c643661808053be5bff7e16c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 1 Jul 2024 17:31:46 -0500 Subject: [PATCH 65/75] KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event ANBZ: #34453 commit 88caf544c9305313e1c48ac1a437faa5df8fff06 upstream. Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Reviewed-by: Tom Lendacky Reviewed-by: Liam Merwick [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 134 +++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 3 + include/uapi/linux/sev-guest.h | 3 + 3 files changed, 140 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c4c8b83c9edb..f3ba68484419 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -395,6 +396,78 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_decommission(handle); } +/* + * This sets up bounce buffers/firmware pages to handle SNP Guest Request + * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB + * 2.0 specification for more details. + * + * Technically, when an SNP Guest Request is issued, the guest will provide its + * own request/response pages, which could in theory be passed along directly + * to firmware rather than using bounce pages. However, these pages would need + * special care: + * + * - Both pages are from shared guest memory, so they need to be protected + * from migration/etc. occurring while firmware reads/writes to them. At a + * minimum, this requires elevating the ref counts and potentially needing + * an explicit pinning of the memory. This places additional restrictions + * on what type of memory backends userspace can use for shared guest + * memory since there is some reliance on using refcounted pages. + * + * - The response page needs to be switched to Firmware-owned[1] state + * before the firmware can write to it, which can lead to potential + * host RMP #PFs if the guest is misbehaved and hands the host a + * guest page that KVM might write to for other reasons (e.g. virtio + * buffers/etc.). + * + * Both of these issues can be avoided completely by using separately-allocated + * bounce pages for both the request/response pages and passing those to + * firmware instead. So that's what is being set up here. + * + * Guest requests rely on message sequence numbers to ensure requests are + * issued to firmware in the order the guest issues them, so concurrent guest + * requests generally shouldn't happen. But a misbehaved guest could issue + * concurrent guest requests in theory, so a mutex is used to serialize + * access to the bounce buffers. + * + * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more + * details on Firmware-owned pages, along with "RMP and VMPL Access Checks" + * in the APM for details on the related RMP restrictions. + */ +static int snp_guest_req_init(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct page *req_page; + + req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!req_page) + return -ENOMEM; + + sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sev->guest_resp_buf) { + __free_page(req_page); + return -EIO; + } + + sev->guest_req_buf = page_address(req_page); + mutex_init(&sev->guest_req_mutex); + + return 0; +} + +static void snp_guest_req_cleanup(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + if (sev->guest_resp_buf) + snp_free_firmware_page(sev->guest_resp_buf); + + if (sev->guest_req_buf) + __free_page(virt_to_page(sev->guest_req_buf)); + + sev->guest_req_buf = NULL; + sev->guest_resp_buf = NULL; +} + static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_init *data, unsigned long vm_type) @@ -486,6 +559,10 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (ret) goto e_free; + /* This needs to happen after SEV/SNP firmware initialization. */ + if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm)) + goto e_free; + INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); sev->need_init = false; @@ -2980,6 +3057,8 @@ void sev_vm_destroy(struct kvm *kvm) } if (sev_snp_guest(kvm)) { + snp_guest_req_cleanup(kvm); + /* * Decomission handles unbinding of the ASID. If it fails for * some unexpected reason, just leak the ASID. @@ -3504,6 +3583,13 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; + case SVM_VMGEXIT_GUEST_REQUEST: + if (!sev_snp_guest(vcpu->kvm) || + !PAGE_ALIGNED(control->exit_info_1) || + !PAGE_ALIGNED(control->exit_info_2) || + control->exit_info_1 == control->exit_info_2) + goto vmgexit_err; + break; default: reason = GHCB_ERR_INVALID_EVENT; goto vmgexit_err; @@ -4129,6 +4215,51 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) return ret; } +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct sev_data_snp_guest_request data = {0}; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + sev_ret_code fw_err = 0; + int ret; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + mutex_lock(&sev->guest_req_mutex); + + if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.req_paddr = __psp_pa(sev->guest_req_buf); + data.res_paddr = __psp_pa(sev->guest_resp_buf); + + /* + * Firmware failures are propagated on to guest, but any other failure + * condition along the way should be reported to userspace. E.g. if + * the PSP is dead and commands are timing out. + */ + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); + if (ret && !fw_err) + goto out_unlock; + + if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err)); + + ret = 1; /* resume guest */ + +out_unlock: + mutex_unlock(&sev->guest_req_mutex); + return ret; +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -4403,6 +4534,9 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; + case SVM_VMGEXIT_GUEST_REQUEST: + ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 20cbad253c9a..2d5b16517b67 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -97,6 +97,9 @@ struct kvm_sev_info { struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; void *snp_context; /* SNP guest context page */ + void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */ + void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ + struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ }; struct kvm_svm { diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h index 154a87a1eca9..fcdfea767fca 100644 --- a/include/uapi/linux/sev-guest.h +++ b/include/uapi/linux/sev-guest.h @@ -89,6 +89,9 @@ struct snp_ext_report_req { #define SNP_GUEST_FW_ERR_MASK GENMASK_ULL(31, 0) #define SNP_GUEST_VMM_ERR_SHIFT 32 #define SNP_GUEST_VMM_ERR(x) (((u64)x) << SNP_GUEST_VMM_ERR_SHIFT) +#define SNP_GUEST_FW_ERR(x) ((x) & SNP_GUEST_FW_ERR_MASK) +#define SNP_GUEST_ERR(vmm_err, fw_err) (SNP_GUEST_VMM_ERR(vmm_err) | \ + SNP_GUEST_FW_ERR(fw_err)) #define SNP_GUEST_VMM_ERR_INVALID_LEN 1 #define SNP_GUEST_VMM_ERR_BUSY 2 -- Gitee From cd97e872a527d42df79ca942a359e2160ae98a44 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 1 Jul 2024 17:31:47 -0500 Subject: [PATCH 66/75] x86/sev: Move sev_guest.h into common SEV header ANBZ: #34453 commit f55f3c3ac69f22ce092506244a31c08a1ca497ba upstream. sev_guest.h currently contains various definitions relating to the format of SNP_GUEST_REQUEST commands to SNP firmware. Currently only the sev-guest driver makes use of them, but when the KVM side of this is implemented there's a need to parse the SNP_GUEST_REQUEST header to determine whether additional information needs to be provided to the guest. Prepare for this by moving those definitions to a common header that's shared by host/guest code so that KVM can also make use of them. Reviewed-by: Tom Lendacky Reviewed-by: Liam Merwick Signed-off-by: Michael Roth Message-ID: <20240701223148.3798365-3-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/include/asm/sev.h | 48 +++++++++++++++++++ drivers/virt/coco/sev-guest/sev-guest.c | 2 - drivers/virt/coco/sev-guest/sev-guest.h | 63 ------------------------- 3 files changed, 48 insertions(+), 65 deletions(-) delete mode 100644 drivers/virt/coco/sev-guest/sev-guest.h diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 9d24caccac88..1698bd650ece 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -118,6 +118,54 @@ struct snp_req_data { unsigned int data_npages; }; +#define MAX_AUTHTAG_LEN 32 + +/* See SNP spec SNP_GUEST_REQUEST section for the structure */ +enum msg_type { + SNP_MSG_TYPE_INVALID = 0, + SNP_MSG_CPUID_REQ, + SNP_MSG_CPUID_RSP, + SNP_MSG_KEY_REQ, + SNP_MSG_KEY_RSP, + SNP_MSG_REPORT_REQ, + SNP_MSG_REPORT_RSP, + SNP_MSG_EXPORT_REQ, + SNP_MSG_EXPORT_RSP, + SNP_MSG_IMPORT_REQ, + SNP_MSG_IMPORT_RSP, + SNP_MSG_ABSORB_REQ, + SNP_MSG_ABSORB_RSP, + SNP_MSG_VMRK_REQ, + SNP_MSG_VMRK_RSP, + + SNP_MSG_TYPE_MAX +}; + +enum aead_algo { + SNP_AEAD_INVALID, + SNP_AEAD_AES_256_GCM, +}; + +struct snp_guest_msg_hdr { + u8 authtag[MAX_AUTHTAG_LEN]; + u64 msg_seqno; + u8 rsvd1[8]; + u8 algo; + u8 hdr_version; + u16 hdr_sz; + u8 msg_type; + u8 msg_version; + u16 msg_sz; + u32 rsvd2; + u8 msg_vmpck; + u8 rsvd3[35]; +} __packed; + +struct snp_guest_msg { + struct snp_guest_msg_hdr hdr; + u8 payload[4000]; +} __packed; + struct sev_guest_platform_data { u64 secrets_gpa; }; diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 8ab4be41a963..198f5827fad3 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -29,8 +29,6 @@ #include #include -#include "sev-guest.h" - #define DEVICE_NAME "sev-guest" #define AAD_LEN 48 #define MSG_HDR_VER 1 diff --git a/drivers/virt/coco/sev-guest/sev-guest.h b/drivers/virt/coco/sev-guest/sev-guest.h deleted file mode 100644 index 21bda26fdb95..000000000000 --- a/drivers/virt/coco/sev-guest/sev-guest.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Advanced Micro Devices, Inc. - * - * Author: Brijesh Singh - * - * SEV-SNP API spec is available at https://developer.amd.com/sev - */ - -#ifndef __VIRT_SEVGUEST_H__ -#define __VIRT_SEVGUEST_H__ - -#include - -#define MAX_AUTHTAG_LEN 32 - -/* See SNP spec SNP_GUEST_REQUEST section for the structure */ -enum msg_type { - SNP_MSG_TYPE_INVALID = 0, - SNP_MSG_CPUID_REQ, - SNP_MSG_CPUID_RSP, - SNP_MSG_KEY_REQ, - SNP_MSG_KEY_RSP, - SNP_MSG_REPORT_REQ, - SNP_MSG_REPORT_RSP, - SNP_MSG_EXPORT_REQ, - SNP_MSG_EXPORT_RSP, - SNP_MSG_IMPORT_REQ, - SNP_MSG_IMPORT_RSP, - SNP_MSG_ABSORB_REQ, - SNP_MSG_ABSORB_RSP, - SNP_MSG_VMRK_REQ, - SNP_MSG_VMRK_RSP, - - SNP_MSG_TYPE_MAX -}; - -enum aead_algo { - SNP_AEAD_INVALID, - SNP_AEAD_AES_256_GCM, -}; - -struct snp_guest_msg_hdr { - u8 authtag[MAX_AUTHTAG_LEN]; - u64 msg_seqno; - u8 rsvd1[8]; - u8 algo; - u8 hdr_version; - u16 hdr_sz; - u8 msg_type; - u8 msg_version; - u16 msg_sz; - u32 rsvd2; - u8 msg_vmpck; - u8 rsvd3[35]; -} __packed; - -struct snp_guest_msg { - struct snp_guest_msg_hdr hdr; - u8 payload[4000]; -} __packed; - -#endif /* __VIRT_SEVGUEST_H__ */ -- Gitee From 27b56d37f9c82017c56e081c7165509c7dfde7f1 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 1 Jul 2024 17:31:48 -0500 Subject: [PATCH 67/75] KVM: SEV: Provide support for SNP_EXTENDED_GUEST_REQUEST NAE event ANBZ: #34453 commit 74458e4859d85ea5963ac1c2bd7fa112f92a1d6d upstream. Version 2 of GHCB specification added support for the SNP Extended Guest Request Message NAE event. This event serves a nearly identical purpose to the previously-added SNP_GUEST_REQUEST event, but for certain message types it allows the guest to supply a buffer to be used for additional information in some cases. Currently the GHCB spec only defines extended handling of this sort in the case of attestation requests, where the additional buffer is used to supply a table of certificate data corresponding to the attestion report's signing key. Support for this extended handling will require additional KVM APIs to handle coordinating with userspace. Whether or not the hypervisor opts to provide this certificate data is optional. However, support for processing SNP_EXTENDED_GUEST_REQUEST GHCB requests is required by the GHCB 2.0 specification for SNP guests, so for now implement a stub implementation that provides an empty certificate table to the guest if it supplies an additional buffer, but otherwise behaves identically to SNP_GUEST_REQUEST. Reviewed-by: Carlos Bilbao Reviewed-by: Tom Lendacky Reviewed-by: Liam Merwick Signed-off-by: Michael Roth Message-ID: <20240701223148.3798365-4-michael.roth@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/svm/sev.c | 56 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f3ba68484419..31de11c60b2b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3584,6 +3584,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) goto vmgexit_err; break; case SVM_VMGEXIT_GUEST_REQUEST: + case SVM_VMGEXIT_EXT_GUEST_REQUEST: if (!sev_snp_guest(vcpu->kvm) || !PAGE_ALIGNED(control->exit_info_1) || !PAGE_ALIGNED(control->exit_info_2) || @@ -4260,6 +4261,58 @@ static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_ return ret; } +static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct kvm *kvm = svm->vcpu.kvm; + u8 msg_type; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), + &msg_type, 1)) + return -EIO; + + /* + * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for + * additional certificate data to be provided alongside the attestation + * report via the guest-provided data pages indicated by RAX/RBX. The + * certificate data is optional and requires additional KVM enablement + * to provide an interface for userspace to provide it, but KVM still + * needs to be able to handle extended guest requests either way. So + * provide a stub implementation that will always return an empty + * certificate table in the guest-provided data pages. + */ + if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 data_npages; + gpa_t data_gpa; + + if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) + goto request_invalid; + + data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; + data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; + + if (!PAGE_ALIGNED(data_gpa)) + goto request_invalid; + + /* + * As per GHCB spec (see "SNP Extended Guest Request"), the + * certificate table is terminated by 24-bytes of zeroes. + */ + if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) + return -EIO; + } + + return snp_handle_guest_req(svm, req_gpa, resp_gpa); + +request_invalid: + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + return 1; /* resume guest */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -4537,6 +4590,9 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) case SVM_VMGEXIT_GUEST_REQUEST: ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); break; + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", -- Gitee From 8b4408b6768c0a7e01a183392ca4e9ef102d31d4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 17 Jul 2024 13:04:48 -0400 Subject: [PATCH 68/75] KVM: x86: disallow pre-fault for SNP VMs before initialization ANBZ: #34453 commit 5932ca411e533e7ad2b97c47b4357a05fa06c2a5 upstream. KVM_PRE_FAULT_MEMORY for an SNP guest can race with sev_gmem_post_populate() in bad ways. The following sequence for instance can potentially trigger an RMP fault: thread A, sev_gmem_post_populate: called thread B, sev_gmem_prepare: places below 'pfn' in a private state in RMP thread A, sev_gmem_post_populate: *vaddr = kmap_local_pfn(pfn + i); thread A, sev_gmem_post_populate: copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE); RMP #PF Fix this by only allowing KVM_PRE_FAULT_MEMORY to run after a guest's initial private memory contents have been finalized via KVM_SEV_SNP_LAUNCH_FINISH. Beyond fixing this issue, it just sort of makes sense to enforce this, since the KVM_PRE_FAULT_MEMORY documentation states: "KVM maps memory as if the vCPU generated a stage-2 read page fault" which sort of implies we should be acting on the same guest state that a vCPU would see post-launch after the initial guest memory is all set up. Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 3 +++ arch/x86/kvm/svm/sev.c | 8 ++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/x86.c | 3 +++ 6 files changed, 22 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 906a77d4b4af..9db3b46aa406 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6355,6 +6355,12 @@ for the current vCPU state. KVM maps memory as if the vCPU generated a stage-2 read page fault, e.g. faults in memory as needed, but doesn't break CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed. +In the case of confidential VM types where there is an initial set up of +private guest memory before the guest is 'finalized'/measured, this ioctl +should only be issued after completing all the necessary setup to put the +guest into a 'finalized' state so that the above semantics can be reliably +ensured. + In some cases, multiple vCPUs might share the page tables. In this case, the ioctl can be called in parallel. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9062ff09b15f..225bee2d3c50 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1288,6 +1288,7 @@ struct kvm_arch { u8 vm_type; bool has_private_mem; bool has_protected_state; + bool pre_fault_allowed; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 13ea28dababd..4b3443b98c65 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4704,6 +4704,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, u64 end; int r; + if (!vcpu->kvm->arch.pre_fault_allowed) + return -EOPNOTSUPP; + /* * reload is efficient when called repeatedly, so we can do it on * every iteration. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 31de11c60b2b..04ca5ede700a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2678,6 +2678,14 @@ static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) data->gctx_paddr = __psp_pa(sev->snp_context); ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + /* + * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages + * can be given to the guest simply by marking the RMP entry as private. + * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. + */ + if (!ret) + kvm->arch.pre_fault_allowed = true; + kfree(id_auth); e_free_id_block: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index fdeb4aa42a34..e1a4dff6478c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5111,6 +5111,7 @@ static int svm_vm_init(struct kvm *kvm) to_kvm_sev_info(kvm)->need_init = true; kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); + kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; } if (!pause_filter_count || !pause_filter_thresh) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8c9aae217a4b..df2f485d7514 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12619,6 +12619,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.vm_type = type; kvm->arch.has_private_mem = (type == KVM_X86_SW_PROTECTED_VM); + /* Decided by the vendor code for other VM types. */ + kvm->arch.pre_fault_allowed = + type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; ret = kvm_page_track_init(kvm); if (ret) -- Gitee From 30a909af4cc4918bbf0e42d029caace6284b6dbe Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:44 -0400 Subject: [PATCH 69/75] KVM: guest_memfd: return folio from __kvm_gmem_get_pfn() ANBZ: #34453 commit d0d87226f535965b4dafc6ef79246456503a4503 upstream. Right now this is simply more consistent and avoids use of pfn_to_page() and put_page(). It will be put to more use in upcoming patches, to ensure that the up-to-date flag is set at the very end of both the kvm_gmem_get_pfn() and kvm_gmem_populate() flows. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index ef14e87a328b..16dbc24de25b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -545,34 +545,34 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) fput(file); } -static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) +static struct folio * +__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) { pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; struct folio *folio; struct page *page; - int r; if (file != slot->gmem.file) { WARN_ON_ONCE(slot->gmem.file); - return -EFAULT; + return ERR_PTR(-EFAULT); } gmem = file->private_data; if (xa_load(&gmem->bindings, index) != slot) { WARN_ON_ONCE(xa_load(&gmem->bindings, index)); - return -EIO; + return ERR_PTR(-EIO); } folio = kvm_gmem_get_folio(file_inode(file), index, prepare); if (IS_ERR(folio)) - return PTR_ERR(folio); + return folio; if (folio_test_hwpoison(folio)) { folio_unlock(folio); folio_put(folio); - return -EHWPOISON; + return ERR_PTR(-EHWPOISON); } page = folio_file_page(folio, index); @@ -581,25 +581,25 @@ static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, if (max_order) *max_order = 0; - r = 0; - folio_unlock(folio); - - return r; + return folio; } int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order) { struct file *file = kvm_gmem_get_file(slot); - int r; + struct folio *folio; if (!file) return -EFAULT; - r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true); + folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true); fput(file); - return r; + if (IS_ERR(folio)) + return PTR_ERR(folio); + + return 0; } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); @@ -629,6 +629,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); for (i = 0; i < npages; i += (1 << max_order)) { + struct folio *folio; gfn_t gfn = start_gfn + i; kvm_pfn_t pfn; @@ -637,9 +638,11 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); - if (ret) + folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; + } if (!IS_ALIGNED(gfn, (1 << max_order)) || (npages - i) < (1 << max_order)) @@ -648,7 +651,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long p = src ? src + i * PAGE_SIZE : NULL; ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); - put_page(pfn_to_page(pfn)); + folio_put(folio); if (ret) break; } -- Gitee From cf4572527eafe571b4a1b5533815037b3f6b69ca Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:45 -0400 Subject: [PATCH 70/75] KVM: guest_memfd: delay folio_mark_uptodate() until after successful preparation ANBZ: #34453 commit d04c77d231223563405e8874fa7edfdc65e545fe upstream. The up-to-date flag as is now is not too useful; it tells guest_memfd not to overwrite the contents of a folio, but it doesn't say that the page is ready to be mapped into the guest. For encrypted guests, mapping a private page requires that the "preparation" phase has succeeded, and at the same time the same page cannot be prepared twice. So, ensure that folio_mark_uptodate() is only called on a prepared page. If kvm_gmem_prepare_folio() or the post_populate callback fail, the folio will not be marked up-to-date; it's not a problem to call clear_highpage() again on such a page prior to the next preparation attempt. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 16dbc24de25b..904cc3aed089 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -73,8 +73,6 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool for (i = 0; i < nr_pages; i++) clear_highpage(folio_page(folio, i)); - - folio_mark_uptodate(folio); } if (prepare) { @@ -84,6 +82,8 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool folio_put(folio); return ERR_PTR(r); } + + folio_mark_uptodate(folio); } /* @@ -650,6 +650,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long p = src ? src + i * PAGE_SIZE : NULL; ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + if (!ret) + folio_mark_uptodate(folio); folio_put(folio); if (ret) -- Gitee From 9278ed38197aad17982bbb786b5b274813b5cdc4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:46 -0400 Subject: [PATCH 71/75] KVM: guest_memfd: do not go through struct page ANBZ: #34453 commit 7fbdda31b0a14ff45809fb27182b98dcbcbad5b0 upstream. We have a perfectly usable folio, use it to retrieve the pfn and order. All that's needed is a version of folio_file_page that returns a pfn. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 904cc3aed089..4614ab71427d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -13,6 +13,18 @@ struct kvm_gmem { struct list_head entry; }; +/** + * folio_file_pfn - like folio_file_page, but return a pfn. + * @folio: The folio which contains this index. + * @index: The index we want to look up. + * + * Return: The pfn for this index. + */ +static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) +{ + return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); +} + static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) { #ifdef CONFIG_HAVE_KVM_GMEM_PREPARE @@ -22,7 +34,6 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol list_for_each_entry(gmem, gmem_list, entry) { struct kvm_memory_slot *slot; struct kvm *kvm = gmem->kvm; - struct page *page; kvm_pfn_t pfn; gfn_t gfn; int rc; @@ -34,13 +45,12 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol if (!slot) continue; - page = folio_file_page(folio, index); - pfn = page_to_pfn(page); + pfn = folio_file_pfn(folio, index); gfn = slot->base_gfn + index - slot->gmem.pgoff; - rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page))); + rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); if (rc) { - pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", - index, gfn, pfn, rc); + pr_warn_ratelimited("gmem: Failed to prepare folio for GFN %llx PFN %llx error %d.\n", + gfn, pfn, rc); return rc; } } @@ -552,7 +562,6 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; struct folio *folio; - struct page *page; if (file != slot->gmem.file) { WARN_ON_ONCE(slot->gmem.file); @@ -575,9 +584,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, return ERR_PTR(-EHWPOISON); } - page = folio_file_page(folio, index); - - *pfn = page_to_pfn(page); + *pfn = folio_file_pfn(folio, index); if (max_order) *max_order = 0; -- Gitee From 78c1658fb6e0fde07d65852d11ef29f9f78a2a9b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:47 -0400 Subject: [PATCH 72/75] KVM: rename CONFIG_HAVE_KVM_GMEM_* to CONFIG_HAVE_KVM_ARCH_GMEM_* ANBZ: #34453 commit 564429a6bd8d26065b2cccffcaa9485359f74de7 upstream. Add "ARCH" to the symbols; shortly, the "prepare" phase will include both the arch-independent step to clear out contents left in the page by the host, and the arch-dependent step enabled by CONFIG_HAVE_KVM_GMEM_PREPARE. For consistency do the same for CONFIG_HAVE_KVM_GMEM_INVALIDATE as well. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- arch/x86/kvm/Kconfig | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- virt/kvm/Kconfig | 4 ++-- virt/kvm/guest_memfd.c | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d682a04ce279..896f2eb0a16b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -130,8 +130,8 @@ config KVM_AMD_SEV depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM select KVM_GENERIC_PRIVATE_MEM - select HAVE_KVM_GMEM_PREPARE - select HAVE_KVM_GMEM_INVALIDATE + select HAVE_KVM_ARCH_GMEM_PREPARE + select HAVE_KVM_ARCH_GMEM_INVALIDATE help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index df2f485d7514..8df9e19d39ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13613,7 +13613,7 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); -#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE bool kvm_arch_gmem_prepare_needed(struct kvm *kvm) { return kvm->arch.vm_type == KVM_X86_SNP_VM; @@ -13625,7 +13625,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord } #endif -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) { static_call_cond(kvm_x86_gmem_invalidate)(start, end); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3c998c5def59..da533ab27cd6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2482,7 +2482,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, } #endif /* CONFIG_KVM_PRIVATE_MEM */ -#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); #endif @@ -2514,7 +2514,7 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index a06a766b65b4..d9466a3a21e4 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -113,10 +113,10 @@ config KVM_GENERIC_PRIVATE_MEM select KVM_PRIVATE_MEM bool -config HAVE_KVM_GMEM_PREPARE +config HAVE_KVM_ARCH_GMEM_PREPARE bool depends on KVM_PRIVATE_MEM -config HAVE_KVM_GMEM_INVALIDATE +config HAVE_KVM_ARCH_GMEM_INVALIDATE bool depends on KVM_PRIVATE_MEM diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 4614ab71427d..b5a58d00084a 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -27,7 +27,7 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) { -#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE struct list_head *gmem_list = &inode->i_mapping->i_private_list; struct kvm_gmem *gmem; @@ -357,7 +357,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol return MF_DELAYED; } -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE static void kvm_gmem_free_folio(struct folio *folio) { struct page *page = folio_page(folio, 0); @@ -372,7 +372,7 @@ static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE .free_folio = kvm_gmem_free_folio, #endif }; -- Gitee From 8d21b3b7f0855573aa4f46806713607b4d1f622b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:48 -0400 Subject: [PATCH 73/75] KVM: guest_memfd: return locked folio from __kvm_gmem_get_pfn ANBZ: #34453 commit 78c4293372fe1fe6060727dfbd5643552e3ff86d upstream. Allow testing the up-to-date flag in the caller without taking the lock again. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index b5a58d00084a..03f76a1e5ceb 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -59,6 +59,7 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol return 0; } +/* Returns a locked folio on success. */ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare) { struct folio *folio; @@ -555,6 +556,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) fput(file); } +/* Returns a locked folio on success. */ static struct folio * __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) @@ -588,7 +590,6 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, if (max_order) *max_order = 0; - folio_unlock(folio); return folio; } @@ -606,6 +607,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, if (IS_ERR(folio)) return PTR_ERR(folio); + folio_unlock(folio); return 0; } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); @@ -651,6 +653,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } + folio_unlock(folio); if (!IS_ALIGNED(gfn, (1 << max_order)) || (npages - i) < (1 << max_order)) max_order = 0; -- Gitee From 40b9538f90a5fbd6aa2ef8903e6cb372dab37abb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:49 -0400 Subject: [PATCH 74/75] KVM: guest_memfd: delay kvm_gmem_prepare_folio() until the memory is passed to the guest ANBZ: #34453 commit b85524314a3db687a87b190fd878aa1206b94b52 upstream. Initializing the contents of the folio on fallocate() is unnecessarily restrictive. It means that the page is registered with the firmware and then it cannot be touched anymore. In particular, this loses the possibility of using fallocate() to pre-allocate the page for SEV-SNP guests, because kvm_arch_gmem_prepare() then fails. It's only when the guest actually accesses the page (and therefore kvm_gmem_get_pfn() is called) that the page must be cleared from any stale host data and registered with the firmware. The up-to-date flag is clear if this has to be done (i.e. it is the first access and kvm_gmem_populate() has not been called). All in all, there are enough differences between kvm_gmem_get_pfn() and kvm_gmem_populate(), that it's better to separate the two flows completely. Extract the bulk of kvm_gmem_get_folio(), which take a folio and end up setting its up-to-date flag, to a new function kvm_gmem_prepare_folio(); these are now done only by the non-__-prefixed kvm_gmem_get_pfn(). As a bonus, __kvm_gmem_get_pfn() loses its ugly "bool prepare" argument. One difference is that fallocate(PUNCH_HOLE) can now race with a page fault. Potentially this causes a page to be prepared and into the filemap even after fallocate(PUNCH_HOLE). This is harmless, as it can be fixed by another hole punching operation, and can be avoided by clearing the private-page attribute prior to invoking fallocate(PUNCH_HOLE). This way, the page fault will cause an exit to user space. The previous semantics, where fallocate() could be used to prepare the pages in advance of running the guest, can be accessed with KVM_PRE_FAULT_MEMORY. For now, accessing a page in one VM will attempt to call kvm_arch_gmem_prepare() in all of those that have bound the guest_memfd. Cleaning this up is left to a separate patch. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv --- virt/kvm/guest_memfd.c | 110 ++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 03f76a1e5ceb..b03f2d63549f 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -25,7 +25,7 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); } -static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) +static int __kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) { #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE struct list_head *gmem_list = &inode->i_mapping->i_private_list; @@ -59,49 +59,63 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol return 0; } -/* Returns a locked folio on success. */ -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare) +/* + * Process @folio, which contains @gfn, so that the guest can use it. + * The folio must be locked and the gfn must be contained in @slot. + * On successful return the guest sees a zero page so as to avoid + * leaking host data and the up-to-date flag is set. + */ +static int kvm_gmem_prepare_folio(struct file *file, struct kvm_memory_slot *slot, + gfn_t gfn, struct folio *folio) { - struct folio *folio; + unsigned long nr_pages, i; + pgoff_t index; + int r; - /* TODO: Support huge pages. */ - folio = filemap_grab_folio(inode->i_mapping, index); - if (IS_ERR(folio)) - return folio; + if (folio_test_uptodate(folio)) + return 0; + + nr_pages = folio_nr_pages(folio); + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); /* - * Use the up-to-date flag to track whether or not the memory has been - * zeroed before being handed off to the guest. There is no backing - * storage for the memory, so the folio will remain up-to-date until - * it's removed. + * Preparing huge folios should always be safe, since it should + * be possible to split them later if needed. * - * TODO: Skip clearing pages when trusted firmware will do it when - * assigning memory to the guest. + * Right now the folio order is always going to be zero, but the + * code is ready for huge folios. The only assumption is that + * the base pgoff of memslots is naturally aligned with the + * requested page order, ensuring that huge folios can also use + * huge page table entries for GPA->HPA mapping. + * + * The order will be passed when creating the guest_memfd, and + * checked when creating memslots. */ - if (!folio_test_uptodate(folio)) { - unsigned long nr_pages = folio_nr_pages(folio); - unsigned long i; - - for (i = 0; i < nr_pages; i++) - clear_highpage(folio_page(folio, i)); - } - - if (prepare) { - int r = kvm_gmem_prepare_folio(inode, index, folio); - if (r < 0) { - folio_unlock(folio); - folio_put(folio); - return ERR_PTR(r); - } + WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); + index = gfn - slot->base_gfn + slot->gmem.pgoff; + index = ALIGN_DOWN(index, 1 << folio_order(folio)); + r = __kvm_gmem_prepare_folio(file_inode(file), index, folio); + if (!r) folio_mark_uptodate(folio); - } - /* - * Ignore accessed, referenced, and dirty flags. The memory is - * unevictable and there is no storage to write back to. - */ - return folio; + return r; +} + +/* + * Returns a locked folio on success. The caller is responsible for + * setting the up-to-date flag before the memory is mapped into the guest. + * There is no backing storage for the memory, so the folio will remain + * up-to-date until it's removed. + * + * Ignore accessed, referenced, and dirty flags. The memory is + * unevictable and there is no storage to write back to. + */ +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +{ + /* TODO: Support huge pages. */ + return filemap_grab_folio(inode->i_mapping, index); } static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, @@ -201,7 +215,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) break; } - folio = kvm_gmem_get_folio(inode, index, true); + folio = kvm_gmem_get_folio(inode, index); if (IS_ERR(folio)) { r = PTR_ERR(folio); break; @@ -559,7 +573,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) /* Returns a locked folio on success. */ static struct folio * __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare) + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) { pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; @@ -576,7 +590,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, return ERR_PTR(-EIO); } - folio = kvm_gmem_get_folio(file_inode(file), index, prepare); + folio = kvm_gmem_get_folio(file_inode(file), index); if (IS_ERR(folio)) return folio; @@ -598,17 +612,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, { struct file *file = kvm_gmem_get_file(slot); struct folio *folio; + int r = 0; if (!file) return -EFAULT; - folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true); - fput(file); - if (IS_ERR(folio)) - return PTR_ERR(folio); + folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order); + if (IS_ERR(folio)) { + r = PTR_ERR(folio); + goto out; + } + r = kvm_gmem_prepare_folio(file, slot, gfn, folio); folio_unlock(folio); - return 0; + if (r < 0) + folio_put(folio); + +out: + fput(file); + return r; } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); @@ -647,7 +669,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false); + folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; -- Gitee From 44bba003b56d32e2c6b9af12ee4e9cfdfaeba14e Mon Sep 17 00:00:00 2001 From: "PVS.NarasimhaRao" Date: Tue, 2 Jun 2026 18:05:16 +0800 Subject: [PATCH 75/75] anolis: configs: classify KVM memory attributes as L1 scope ANBZ: #34453 Add L1-RECOMMEND configs for KVM generic and private memory handling. Includes invalidate/prepare hooks and attribute definitions under x86 scope. Signed-off-by: PVS.NarasimhaRao --- .../L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE | 1 + .../configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE | 1 + .../L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES | 1 + .../configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY | 1 + anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRIVATE_MEM | 1 + anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_PRIVATE_MEM | 1 + 6 files changed, 6 insertions(+) create mode 100644 anolis/configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE create mode 100644 anolis/configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE create mode 100644 anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES create mode 100644 anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY create mode 100644 anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRIVATE_MEM create mode 100644 anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_PRIVATE_MEM diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE b/anolis/configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE new file mode 100644 index 000000000000..d5e3841a448c --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE @@ -0,0 +1 @@ +CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE=y diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE b/anolis/configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE new file mode 100644 index 000000000000..1371f06c5b4c --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/x86/CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE @@ -0,0 +1 @@ +CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE=y diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES b/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES new file mode 100644 index 000000000000..bb5b79df4f25 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES @@ -0,0 +1 @@ +CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES=y diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY b/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY new file mode 100644 index 000000000000..7eea0654f002 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY @@ -0,0 +1 @@ +CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY=y diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRIVATE_MEM b/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRIVATE_MEM new file mode 100644 index 000000000000..544d061dc069 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_GENERIC_PRIVATE_MEM @@ -0,0 +1 @@ +CONFIG_KVM_GENERIC_PRIVATE_MEM=y diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_PRIVATE_MEM b/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_PRIVATE_MEM new file mode 100644 index 000000000000..b4b74d6cb208 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/x86/CONFIG_KVM_PRIVATE_MEM @@ -0,0 +1 @@ +CONFIG_KVM_PRIVATE_MEM=y -- Gitee