mirror of
https://github.com/SlackBuildsOrg/slackbuilds
synced 2024-09-30 05:39:56 +02:00
9be84725e7
Signed-off-by: Mario Preksavec <mario@slackware.hr>
4115 lines
142 KiB
Diff
4115 lines
142 KiB
Diff
diff --git a/docs/man/xl.conf.pod.5 b/docs/man/xl.conf.pod.5
|
|
index da91b8626c..37262a7ef8 100644
|
|
--- a/docs/man/xl.conf.pod.5
|
|
+++ b/docs/man/xl.conf.pod.5
|
|
@@ -185,6 +185,28 @@ massively huge guests).
|
|
|
|
=back
|
|
|
|
+=item B<vm.cpumask>="CPULIST"
|
|
+
|
|
+=item B<vm.hvm.cpumask>="CPULIST"
|
|
+
|
|
+=item B<vm.pv.cpumask>="CPULIST"
|
|
+
|
|
+Global masks that are applied when creating guests and pinning vcpus
|
|
+to indicate which cpus they are allowed to run on. Specifically,
|
|
+C<vm.cpumask> applies to all guest types, C<vm.hvm.cpumask> applies to
|
|
+both HVM and PVH guests and C<vm.pv.cpumask> applies to PV guests.
|
|
+
|
|
+The hard affinity of guest's vcpus is logical-AND'ed with respective
|
|
+masks. If the resulting affinity mask is empty, operation will fail.
|
|
+
|
|
+Use --ignore-global-affinity-masks to skip applying global masks.
|
|
+
|
|
+The default value for these masks is all 1's, i.e. all cpus are allowed.
|
|
+
|
|
+Due to bug(s), these options may not interact well with other options
|
|
+concerning CPU affinity. One example is CPU pools. Users should always double
|
|
+check that the required affinity has taken effect.
|
|
+
|
|
=back
|
|
|
|
=head1 SEE ALSO
|
|
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
|
|
index 075e5ea159..0886706368 100644
|
|
--- a/docs/misc/xen-command-line.markdown
|
|
+++ b/docs/misc/xen-command-line.markdown
|
|
@@ -489,10 +489,10 @@ accounting for hardware capabilities as enumerated via CPUID.
|
|
|
|
Currently accepted:
|
|
|
|
-The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`, `ssbd` are
|
|
-used by default if available and applicable. They can be ignored,
|
|
-e.g. `no-ibrsb`, at which point Xen won't use them itself, and won't offer
|
|
-them to guests.
|
|
+The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`,
|
|
+`l1d-flush` and `ssbd` are used by default if available and applicable. They can
|
|
+be ignored, e.g. `no-ibrsb`, at which point Xen won't use them itself, and
|
|
+won't offer them to guests.
|
|
|
|
### cpuid\_mask\_cpu (AMD only)
|
|
> `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b`
|
|
@@ -936,6 +936,8 @@ version are 1 and 2.
|
|
use of grant table v2 without transitive grants is an ABI breakage from the
|
|
guests point of view.
|
|
|
|
+The usage of gnttab v2 is not security supported on ARM platforms.
|
|
+
|
|
### gnttab\_max\_frames
|
|
> `= <integer>`
|
|
|
|
@@ -1544,6 +1546,30 @@ do; there may be other custom operating systems which do. If you're
|
|
certain you don't plan on having PV guests which use this feature,
|
|
turning it off can reduce the attack surface.
|
|
|
|
+### pv-l1tf (x86)
|
|
+> `= List of [ <bool>, dom0=<bool>, domu=<bool> ]`
|
|
+
|
|
+> Default: `false` on believed-unaffected hardware, or in pv-shim mode.
|
|
+> `domu` on believed-affected hardware.
|
|
+
|
|
+Mitigations for L1TF / XSA-273 / CVE-2018-3620 for PV guests.
|
|
+
|
|
+For backwards compatibility, we may not alter an architecturally-legitimate
|
|
+pagetable entry a PV guest chooses to write. We can however force such a
|
|
+guest into shadow mode so that Xen controls the PTEs which are reachable by
|
|
+the CPU pagewalk.
|
|
+
|
|
+Shadowing is performed at the point where a PV guest first tries to write an
|
|
+L1TF-vulnerable PTE. Therefore, a PV guest kernel which has been updated with
|
|
+its own L1TF mitigations will not trigger shadow mode if it is well behaved.
|
|
+
|
|
+If CONFIG\_SHADOW\_PAGING is not compiled in, this mitigation instead crashes
|
|
+the guest when an L1TF-vulnerable PTE is written, which still allows updated,
|
|
+well-behaved PV guests to run, despite Shadow being compiled out.
|
|
+
|
|
+In the pv-shim case, Shadow is expected to be compiled out, and a malicious
|
|
+guest kernel can only leak data from the shim Xen, rather than the host Xen.
|
|
+
|
|
### pv-shim (x86)
|
|
> `= <boolean>`
|
|
|
|
@@ -1748,6 +1774,13 @@ Use `smap=hvm` to allow SMAP use by HVM guests only.
|
|
Flag to enable Supervisor Mode Execution Protection
|
|
Use `smep=hvm` to allow SMEP use by HVM guests only.
|
|
|
|
+### smt (x86)
|
|
+> `= <boolean>`
|
|
+
|
|
+Default: `true`
|
|
+
|
|
+Control bring up of multiple hyper-threads per CPU core.
|
|
+
|
|
### snb\_igd\_quirk
|
|
> `= <boolean> | cap | <integer>`
|
|
|
|
@@ -1758,7 +1791,8 @@ false disable the quirk workaround, which is also the default.
|
|
|
|
### spec-ctrl (x86)
|
|
> `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb}=<bool>,
|
|
-> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu}=<bool> ]`
|
|
+> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
|
|
+> l1d-flush}=<bool> ]`
|
|
|
|
Controls for speculative execution sidechannel mitigations. By default, Xen
|
|
will pick the most appropriate mitigations based on compiled in support,
|
|
@@ -1770,10 +1804,15 @@ extreme care.**
|
|
|
|
An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
|
|
mitigations, including pieces of infrastructure used to virtualise certain
|
|
-mitigation features for guests. Alternatively, a slightly more restricted
|
|
-`spec-ctrl=no-xen` can be used to turn off all of Xen's mitigations, while
|
|
-leaving the virtualisation support in place for guests to use. Use of a
|
|
-positive boolean value for either of these options is invalid.
|
|
+mitigation features for guests. This also includes settings which `xpti`,
|
|
+`smt`, `pv-l1tf` control, unless the respective option(s) have been
|
|
+specified earlier on the command line.
|
|
+
|
|
+Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
|
|
+turn off all of Xen's mitigations, while leaving the virtualisation support
|
|
+in place for guests to use.
|
|
+
|
|
+Use of a positive boolean value for either of these options is invalid.
|
|
|
|
The booleans `pv=`, `hvm=`, `msr-sc=` and `rsb=` offer fine grained control
|
|
over the alternative blocks used by Xen. These impact Xen's ability to
|
|
@@ -1813,6 +1852,12 @@ from using fully eager FPU context switches. This is currently implemented as
|
|
a global control. By default, Xen will choose to use fully eager context
|
|
switches on hardware believed to speculate past #NM exceptions.
|
|
|
|
+On hardware supporting L1D_FLUSH, the `l1d-flush=` option can be used to force
|
|
+or prevent Xen from issuing an L1 data cache flush on each VMEntry.
|
|
+Irrespective of Xen's setting, the feature is virtualised for HVM guests to
|
|
+use. By default, Xen will enable this mitigation on hardware believed to be
|
|
+vulnerable to L1TF.
|
|
+
|
|
### sync\_console
|
|
> `= <boolean>`
|
|
|
|
diff --git a/tools/examples/xl.conf b/tools/examples/xl.conf
|
|
index 374b6bbc2e..0446deb304 100644
|
|
--- a/tools/examples/xl.conf
|
|
+++ b/tools/examples/xl.conf
|
|
@@ -37,3 +37,8 @@
|
|
# (which can take a long time to find out if launching huge guests).
|
|
# see xl.conf(5) for details.
|
|
#claim_mode=1
|
|
+
|
|
+# Specify global vcpu hard affinity masks. See xl.conf(5) for details.
|
|
+#vm.cpumask="0-7"
|
|
+#vm.pv.cpumask="0-3"
|
|
+#vm.hvm.cpumask="3-7"
|
|
diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c
|
|
index 7b0f594c3d..52e16c20ed 100644
|
|
--- a/tools/libxl/libxl_cpuid.c
|
|
+++ b/tools/libxl/libxl_cpuid.c
|
|
@@ -204,6 +204,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str)
|
|
{"avx512-4fmaps",0x00000007, 0, CPUID_REG_EDX, 3, 1},
|
|
{"ibrsb", 0x00000007, 0, CPUID_REG_EDX, 26, 1},
|
|
{"stibp", 0x00000007, 0, CPUID_REG_EDX, 27, 1},
|
|
+ {"l1d-flush", 0x00000007, 0, CPUID_REG_EDX, 28, 1},
|
|
{"arch-caps", 0x00000007, 0, CPUID_REG_EDX, 29, 1},
|
|
{"ssbd", 0x00000007, 0, CPUID_REG_EDX, 31, 1},
|
|
|
|
diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
|
|
index e116339733..3888b4e158 100644
|
|
--- a/tools/misc/xen-cpuid.c
|
|
+++ b/tools/misc/xen-cpuid.c
|
|
@@ -143,7 +143,7 @@ static const char *str_7d0[32] =
|
|
[ 2] = "avx512_4vnniw", [ 3] = "avx512_4fmaps",
|
|
|
|
[26] = "ibrsb", [27] = "stibp",
|
|
- /* 28 */ [29] = "arch_caps",
|
|
+ [28] = "l1d_flush", [29] = "arch_caps",
|
|
/* 30 */ [31] = "ssbd",
|
|
};
|
|
|
|
diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml
|
|
index 13cf3b5bf4..5a8c377603 100644
|
|
--- a/tools/ocaml/xenstored/store.ml
|
|
+++ b/tools/ocaml/xenstored/store.ml
|
|
@@ -262,7 +262,8 @@ let path_write store perm path value =
|
|
Node.check_perm store.root perm Perms.WRITE;
|
|
Node.set_value store.root value, false
|
|
) else
|
|
- Path.apply_modify store.root path do_write, !node_created
|
|
+ let root = Path.apply_modify store.root path do_write in
|
|
+ root, !node_created
|
|
|
|
let path_rm store perm path =
|
|
let do_rm node name =
|
|
diff --git a/tools/xl/xl.c b/tools/xl/xl.c
|
|
index 179908b4f6..7d2142f16f 100644
|
|
--- a/tools/xl/xl.c
|
|
+++ b/tools/xl/xl.c
|
|
@@ -28,6 +28,9 @@
|
|
#include <libxl_utils.h>
|
|
#include <libxlutil.h>
|
|
#include "xl.h"
|
|
+#include "xl_parse.h"
|
|
+
|
|
+#include "xl_utils.h"
|
|
|
|
xentoollog_logger_stdiostream *logger;
|
|
int dryrun_only;
|
|
@@ -42,6 +45,9 @@ char *default_gatewaydev = NULL;
|
|
char *default_vifbackend = NULL;
|
|
char *default_remus_netbufscript = NULL;
|
|
char *default_colo_proxy_script = NULL;
|
|
+libxl_bitmap global_vm_affinity_mask;
|
|
+libxl_bitmap global_hvm_affinity_mask;
|
|
+libxl_bitmap global_pv_affinity_mask;
|
|
enum output_format default_output_format = OUTPUT_FORMAT_JSON;
|
|
int claim_mode = 1;
|
|
bool progress_use_cr = 0;
|
|
@@ -203,6 +209,26 @@ static void parse_global_config(const char *configfile,
|
|
if (!xlu_cfg_get_long (config, "max_maptrack_frames", &l, 0))
|
|
max_maptrack_frames = l;
|
|
|
|
+ libxl_bitmap_init(&global_vm_affinity_mask);
|
|
+ libxl_cpu_bitmap_alloc(ctx, &global_vm_affinity_mask, 0);
|
|
+ libxl_bitmap_init(&global_hvm_affinity_mask);
|
|
+ libxl_cpu_bitmap_alloc(ctx, &global_hvm_affinity_mask, 0);
|
|
+ libxl_bitmap_init(&global_pv_affinity_mask);
|
|
+ libxl_cpu_bitmap_alloc(ctx, &global_pv_affinity_mask, 0);
|
|
+
|
|
+ if (!xlu_cfg_get_string (config, "vm.cpumask", &buf, 0))
|
|
+ parse_cpurange(buf, &global_vm_affinity_mask);
|
|
+ else
|
|
+ libxl_bitmap_set_any(&global_vm_affinity_mask);
|
|
+ if (!xlu_cfg_get_string (config, "vm.hvm.cpumask", &buf, 0))
|
|
+ parse_cpurange(buf, &global_hvm_affinity_mask);
|
|
+ else
|
|
+ libxl_bitmap_set_any(&global_hvm_affinity_mask);
|
|
+ if (!xlu_cfg_get_string (config, "vm.pv.cpumask", &buf, 0))
|
|
+ parse_cpurange(buf, &global_pv_affinity_mask);
|
|
+ else
|
|
+ libxl_bitmap_set_any(&global_pv_affinity_mask);
|
|
+
|
|
xlu_cfg_destroy(config);
|
|
}
|
|
|
|
diff --git a/tools/xl/xl.h b/tools/xl/xl.h
|
|
index 4e784ff402..7e97144b50 100644
|
|
--- a/tools/xl/xl.h
|
|
+++ b/tools/xl/xl.h
|
|
@@ -41,6 +41,7 @@ struct domain_create {
|
|
int vncautopass;
|
|
int console_autoconnect;
|
|
int checkpointed_stream;
|
|
+ int ignore_global_affinity_masks;
|
|
const char *config_file;
|
|
char *extra_config; /* extra config string */
|
|
const char *restore_file;
|
|
@@ -279,6 +280,9 @@ extern char *default_colo_proxy_script;
|
|
extern char *blkdev_start;
|
|
extern int max_grant_frames;
|
|
extern int max_maptrack_frames;
|
|
+extern libxl_bitmap global_vm_affinity_mask;
|
|
+extern libxl_bitmap global_hvm_affinity_mask;
|
|
+extern libxl_bitmap global_pv_affinity_mask;
|
|
|
|
enum output_format {
|
|
OUTPUT_FORMAT_JSON,
|
|
@@ -294,6 +298,9 @@ typedef enum {
|
|
} domain_restart_type;
|
|
|
|
extern void printf_info_sexp(int domid, libxl_domain_config *d_config, FILE *fh);
|
|
+extern void apply_global_affinity_masks(libxl_domain_type type,
|
|
+ libxl_bitmap *vcpu_affinity_array,
|
|
+ unsigned int size);
|
|
|
|
#define XL_GLOBAL_CONFIG XEN_CONFIG_DIR "/xl.conf"
|
|
#define XL_LOCK_FILE XEN_LOCK_DIR "/xl"
|
|
diff --git a/tools/xl/xl_cmdtable.c b/tools/xl/xl_cmdtable.c
|
|
index bf2ced8140..54c2db6022 100644
|
|
--- a/tools/xl/xl_cmdtable.c
|
|
+++ b/tools/xl/xl_cmdtable.c
|
|
@@ -34,7 +34,8 @@ struct cmd_spec cmd_table[] = {
|
|
"-e Do not wait in the background for the death of the domain.\n"
|
|
"-V, --vncviewer Connect to the VNC display after the domain is created.\n"
|
|
"-A, --vncviewer-autopass\n"
|
|
- " Pass VNC password to viewer via stdin."
|
|
+ " Pass VNC password to viewer via stdin.\n"
|
|
+ "--ignore-global-affinity-masks Ignore global masks in xl.conf."
|
|
},
|
|
{ "config-update",
|
|
&main_config_update, 1, 1,
|
|
@@ -224,7 +225,8 @@ struct cmd_spec cmd_table[] = {
|
|
&main_vcpupin, 1, 1,
|
|
"Set which CPUs a VCPU can use",
|
|
"[option] <Domain> <VCPU|all> <Hard affinity|-|all> <Soft affinity|-|all>",
|
|
- "-f, --force undo an override pinning done by the kernel",
|
|
+ "-f, --force undo an override pinning done by the kernel\n"
|
|
+ "--ignore-global-affinity-masks Ignore global masks in xl.conf",
|
|
},
|
|
{ "vcpu-set",
|
|
&main_vcpuset, 0, 1,
|
|
diff --git a/tools/xl/xl_vcpu.c b/tools/xl/xl_vcpu.c
|
|
index 8e735b38c1..3384eeed06 100644
|
|
--- a/tools/xl/xl_vcpu.c
|
|
+++ b/tools/xl/xl_vcpu.c
|
|
@@ -68,6 +68,61 @@ static void print_domain_vcpuinfo(uint32_t domid, uint32_t nr_cpus)
|
|
libxl_vcpuinfo_list_free(vcpuinfo, nb_vcpu);
|
|
}
|
|
|
|
+void apply_global_affinity_masks(libxl_domain_type type,
|
|
+ libxl_bitmap *vcpu_affinity_array,
|
|
+ unsigned int size)
|
|
+{
|
|
+ libxl_bitmap *mask = &global_vm_affinity_mask;
|
|
+ libxl_bitmap *type_mask;
|
|
+ unsigned int i;
|
|
+
|
|
+ switch (type) {
|
|
+ case LIBXL_DOMAIN_TYPE_HVM:
|
|
+ case LIBXL_DOMAIN_TYPE_PVH:
|
|
+ type_mask = &global_hvm_affinity_mask;
|
|
+ break;
|
|
+ case LIBXL_DOMAIN_TYPE_PV:
|
|
+ type_mask = &global_pv_affinity_mask;
|
|
+ break;
|
|
+ default:
|
|
+ fprintf(stderr, "Unknown guest type\n");
|
|
+ exit(EXIT_FAILURE);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < size; i++) {
|
|
+ int rc;
|
|
+ libxl_bitmap *t = &vcpu_affinity_array[i];
|
|
+ libxl_bitmap b1, b2;
|
|
+
|
|
+ libxl_bitmap_init(&b1);
|
|
+ libxl_bitmap_init(&b2);
|
|
+
|
|
+ rc = libxl_bitmap_and(ctx, &b1, t, mask);
|
|
+ if (rc) {
|
|
+ fprintf(stderr, "libxl_bitmap_and errored\n");
|
|
+ exit(EXIT_FAILURE);
|
|
+ }
|
|
+ rc = libxl_bitmap_and(ctx, &b2, &b1, type_mask);
|
|
+ if (rc) {
|
|
+ fprintf(stderr, "libxl_bitmap_and errored\n");
|
|
+ exit(EXIT_FAILURE);
|
|
+ }
|
|
+
|
|
+ if (libxl_bitmap_is_empty(&b2)) {
|
|
+ fprintf(stderr, "vcpu hard affinity map is empty\n");
|
|
+ exit(EXIT_FAILURE);
|
|
+ }
|
|
+
|
|
+ /* Replace target bitmap with the result */
|
|
+ libxl_bitmap_dispose(t);
|
|
+ libxl_bitmap_init(t);
|
|
+ libxl_bitmap_copy_alloc(ctx, t, &b2);
|
|
+
|
|
+ libxl_bitmap_dispose(&b1);
|
|
+ libxl_bitmap_dispose(&b2);
|
|
+ }
|
|
+}
|
|
+
|
|
static void vcpulist(int argc, char **argv)
|
|
{
|
|
libxl_dominfo *dominfo;
|
|
@@ -118,6 +173,7 @@ int main_vcpupin(int argc, char **argv)
|
|
{
|
|
static struct option opts[] = {
|
|
{"force", 0, 0, 'f'},
|
|
+ {"ignore-global-affinity-masks", 0, 0, 'i'},
|
|
COMMON_LONG_OPTS
|
|
};
|
|
libxl_vcpuinfo *vcpuinfo;
|
|
@@ -132,15 +188,18 @@ int main_vcpupin(int argc, char **argv)
|
|
const char *vcpu, *hard_str, *soft_str;
|
|
char *endptr;
|
|
int opt, nb_cpu, nb_vcpu, rc = EXIT_FAILURE;
|
|
- bool force = false;
|
|
+ bool force = false, ignore_masks = false;
|
|
|
|
libxl_bitmap_init(&cpumap_hard);
|
|
libxl_bitmap_init(&cpumap_soft);
|
|
|
|
- SWITCH_FOREACH_OPT(opt, "f", opts, "vcpu-pin", 3) {
|
|
+ SWITCH_FOREACH_OPT(opt, "fi", opts, "vcpu-pin", 3) {
|
|
case 'f':
|
|
force = true;
|
|
break;
|
|
+ case 'i':
|
|
+ ignore_masks = true;
|
|
+ break;
|
|
default:
|
|
break;
|
|
}
|
|
@@ -222,6 +281,23 @@ int main_vcpupin(int argc, char **argv)
|
|
goto out;
|
|
}
|
|
|
|
+ /* Only hard affinity matters here */
|
|
+ if (!ignore_masks) {
|
|
+ libxl_domain_config d_config;
|
|
+
|
|
+ libxl_domain_config_init(&d_config);
|
|
+ rc = libxl_retrieve_domain_configuration(ctx, domid, &d_config);
|
|
+ if (rc) {
|
|
+ fprintf(stderr, "Could not retrieve domain configuration\n");
|
|
+ libxl_domain_config_dispose(&d_config);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ apply_global_affinity_masks(d_config.b_info.type, hard, 1);
|
|
+
|
|
+ libxl_domain_config_dispose(&d_config);
|
|
+ }
|
|
+
|
|
if (force) {
|
|
if (libxl_set_vcpuaffinity_force(ctx, domid, vcpuid, hard, soft)) {
|
|
fprintf(stderr, "Could not set affinity for vcpu `%ld'.\n",
|
|
diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c
|
|
index 89c2b25ded..a1d633795c 100644
|
|
--- a/tools/xl/xl_vmcontrol.c
|
|
+++ b/tools/xl/xl_vmcontrol.c
|
|
@@ -804,6 +804,36 @@ int create_domain(struct domain_create *dom_info)
|
|
parse_config_data(config_source, config_data, config_len, &d_config);
|
|
}
|
|
|
|
+ if (!dom_info->ignore_global_affinity_masks) {
|
|
+ libxl_domain_build_info *b_info = &d_config.b_info;
|
|
+
|
|
+ /* It is possible that no hard affinity is specified in config file.
|
|
+ * Generate hard affinity maps now if we care about those.
|
|
+ */
|
|
+ if (b_info->num_vcpu_hard_affinity == 0 &&
|
|
+ (!libxl_bitmap_is_full(&global_vm_affinity_mask) ||
|
|
+ (b_info->type == LIBXL_DOMAIN_TYPE_PV &&
|
|
+ !libxl_bitmap_is_full(&global_pv_affinity_mask)) ||
|
|
+ (b_info->type != LIBXL_DOMAIN_TYPE_PV &&
|
|
+ !libxl_bitmap_is_full(&global_hvm_affinity_mask))
|
|
+ )) {
|
|
+ b_info->num_vcpu_hard_affinity = b_info->max_vcpus;
|
|
+ b_info->vcpu_hard_affinity =
|
|
+ xmalloc(b_info->max_vcpus * sizeof(libxl_bitmap));
|
|
+
|
|
+ for (i = 0; i < b_info->num_vcpu_hard_affinity; i++) {
|
|
+ libxl_bitmap *m = &b_info->vcpu_hard_affinity[i];
|
|
+ libxl_bitmap_init(m);
|
|
+ libxl_cpu_bitmap_alloc(ctx, m, 0);
|
|
+ libxl_bitmap_set_any(m);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ apply_global_affinity_masks(b_info->type,
|
|
+ b_info->vcpu_hard_affinity,
|
|
+ b_info->num_vcpu_hard_affinity);
|
|
+ }
|
|
+
|
|
if (migrate_fd >= 0) {
|
|
if (d_config.c_info.name) {
|
|
/* when we receive a domain we get its name from the config
|
|
@@ -1124,7 +1154,7 @@ int main_create(int argc, char **argv)
|
|
const char *filename = NULL;
|
|
struct domain_create dom_info;
|
|
int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0,
|
|
- quiet = 0, monitor = 1, vnc = 0, vncautopass = 0;
|
|
+ quiet = 0, monitor = 1, vnc = 0, vncautopass = 0, ignore_masks = 0;
|
|
int opt, rc;
|
|
static struct option opts[] = {
|
|
{"dryrun", 0, 0, 'n'},
|
|
@@ -1132,6 +1162,7 @@ int main_create(int argc, char **argv)
|
|
{"defconfig", 1, 0, 'f'},
|
|
{"vncviewer", 0, 0, 'V'},
|
|
{"vncviewer-autopass", 0, 0, 'A'},
|
|
+ {"ignore-global-affinity-masks", 0, 0, 'i'},
|
|
COMMON_LONG_OPTS
|
|
};
|
|
|
|
@@ -1142,7 +1173,7 @@ int main_create(int argc, char **argv)
|
|
argc--; argv++;
|
|
}
|
|
|
|
- SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVA", opts, "create", 0) {
|
|
+ SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVAi", opts, "create", 0) {
|
|
case 'f':
|
|
filename = optarg;
|
|
break;
|
|
@@ -1174,6 +1205,9 @@ int main_create(int argc, char **argv)
|
|
case 'A':
|
|
vnc = vncautopass = 1;
|
|
break;
|
|
+ case 'i':
|
|
+ ignore_masks = 1;
|
|
+ break;
|
|
}
|
|
|
|
memset(&dom_info, 0, sizeof(dom_info));
|
|
@@ -1203,6 +1237,7 @@ int main_create(int argc, char **argv)
|
|
dom_info.vnc = vnc;
|
|
dom_info.vncautopass = vncautopass;
|
|
dom_info.console_autoconnect = console_autoconnect;
|
|
+ dom_info.ignore_global_affinity_masks = ignore_masks;
|
|
|
|
rc = create_domain(&dom_info);
|
|
if (rc < 0) {
|
|
#diff --git a/xen/Makefile b/xen/Makefile
|
|
#index 4d075c381f..a922a1b7b5 100644
|
|
#--- a/xen/Makefile
|
|
#+++ b/xen/Makefile
|
|
#@@ -2,7 +2,7 @@
|
|
# # All other places this is stored (eg. compile.h) should be autogenerated.
|
|
# export XEN_VERSION = 4
|
|
# export XEN_SUBVERSION = 11
|
|
#-export XEN_EXTRAVERSION ?= .0$(XEN_VENDORVERSION)
|
|
#+export XEN_EXTRAVERSION ?= .1-pre$(XEN_VENDORVERSION)
|
|
# export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
|
|
# -include xen-version
|
|
#
|
|
diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
|
|
index f64fc56739..cfba4a708c 100644
|
|
--- a/xen/arch/x86/Kconfig
|
|
+++ b/xen/arch/x86/Kconfig
|
|
@@ -72,6 +72,7 @@ config SHADOW_PAGING
|
|
* Running HVM guests on hardware lacking hardware paging support
|
|
(First-generation Intel VT-x or AMD SVM).
|
|
* Live migration of PV guests.
|
|
+ * L1TF sidechannel mitigation for PV guests.
|
|
|
|
Under a small number of specific workloads, shadow paging may be
|
|
deliberately used as a performance optimisation.
|
|
diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
|
|
index 458a3fe60c..76078b55b2 100644
|
|
--- a/xen/arch/x86/cpu/amd.c
|
|
+++ b/xen/arch/x86/cpu/amd.c
|
|
@@ -505,17 +505,23 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
|
|
u32 eax, ebx, ecx, edx;
|
|
|
|
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
|
|
- c->compute_unit_id = ebx & 0xFF;
|
|
c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1;
|
|
+
|
|
+ if (c->x86 < 0x17)
|
|
+ c->compute_unit_id = ebx & 0xFF;
|
|
+ else {
|
|
+ c->cpu_core_id = ebx & 0xFF;
|
|
+ c->x86_max_cores /= c->x86_num_siblings;
|
|
+ }
|
|
}
|
|
|
|
if (opt_cpu_info)
|
|
printk("CPU %d(%d) -> Processor %d, %s %d\n",
|
|
cpu, c->x86_max_cores, c->phys_proc_id,
|
|
- cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" :
|
|
- "Core",
|
|
- cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id :
|
|
- c->cpu_core_id);
|
|
+ c->compute_unit_id != INVALID_CUID ? "Compute Unit"
|
|
+ : "Core",
|
|
+ c->compute_unit_id != INVALID_CUID ? c->compute_unit_id
|
|
+ : c->cpu_core_id);
|
|
}
|
|
|
|
static void early_init_amd(struct cpuinfo_x86 *c)
|
|
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
|
|
index 528aff1811..fdb022875a 100644
|
|
--- a/xen/arch/x86/cpu/common.c
|
|
+++ b/xen/arch/x86/cpu/common.c
|
|
@@ -14,6 +14,7 @@
|
|
#include <public/sysctl.h> /* for XEN_INVALID_{SOCKET,CORE}_ID */
|
|
|
|
#include "cpu.h"
|
|
+#include "mcheck/x86_mca.h"
|
|
|
|
bool_t opt_arat = 1;
|
|
boolean_param("arat", opt_arat);
|
|
@@ -355,6 +356,9 @@ static void __init early_cpu_detect(void)
|
|
hap_paddr_bits = PADDR_BITS;
|
|
}
|
|
|
|
+ if (c->x86_vendor != X86_VENDOR_AMD)
|
|
+ park_offline_cpus = opt_mce;
|
|
+
|
|
initialize_cpu_data(0);
|
|
}
|
|
|
|
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
|
|
index a8c287d124..32273d9208 100644
|
|
--- a/xen/arch/x86/cpu/mcheck/mce.c
|
|
+++ b/xen/arch/x86/cpu/mcheck/mce.c
|
|
@@ -692,12 +692,15 @@ static void cpu_bank_free(unsigned int cpu)
|
|
|
|
mcabanks_free(poll);
|
|
mcabanks_free(clr);
|
|
+
|
|
+ per_cpu(poll_bankmask, cpu) = NULL;
|
|
+ per_cpu(mce_clear_banks, cpu) = NULL;
|
|
}
|
|
|
|
static int cpu_bank_alloc(unsigned int cpu)
|
|
{
|
|
- struct mca_banks *poll = mcabanks_alloc();
|
|
- struct mca_banks *clr = mcabanks_alloc();
|
|
+ struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc();
|
|
+ struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc();
|
|
|
|
if ( !poll || !clr )
|
|
{
|
|
@@ -725,7 +728,13 @@ static int cpu_callback(
|
|
|
|
case CPU_UP_CANCELED:
|
|
case CPU_DEAD:
|
|
- cpu_bank_free(cpu);
|
|
+ if ( !park_offline_cpus )
|
|
+ cpu_bank_free(cpu);
|
|
+ break;
|
|
+
|
|
+ case CPU_REMOVE:
|
|
+ if ( park_offline_cpus )
|
|
+ cpu_bank_free(cpu);
|
|
break;
|
|
}
|
|
|
|
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
|
|
index e5dd956a24..4474a34e34 100644
|
|
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
|
|
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
|
|
@@ -636,8 +636,6 @@ static void clear_cmci(void)
|
|
|
|
static void cpu_mcheck_disable(void)
|
|
{
|
|
- clear_in_cr4(X86_CR4_MCE);
|
|
-
|
|
if ( cmci_support && opt_mce )
|
|
clear_cmci();
|
|
}
|
|
diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
|
|
index 207e2e712c..6e27f6ec8e 100644
|
|
--- a/xen/arch/x86/cpu/vpmu_intel.c
|
|
+++ b/xen/arch/x86/cpu/vpmu_intel.c
|
|
@@ -454,13 +454,11 @@ static int core2_vpmu_alloc_resource(struct vcpu *v)
|
|
|
|
if ( is_hvm_vcpu(v) )
|
|
{
|
|
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
|
- if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
|
|
+ if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
|
|
goto out_err;
|
|
|
|
- if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
|
|
+ if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
|
|
goto out_err;
|
|
- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
|
}
|
|
|
|
core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) +
|
|
@@ -535,27 +533,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
|
|
uint64_t *enabled_cntrs;
|
|
|
|
if ( !core2_vpmu_msr_common_check(msr, &type, &index) )
|
|
- {
|
|
- /* Special handling for BTS */
|
|
- if ( msr == MSR_IA32_DEBUGCTLMSR )
|
|
- {
|
|
- supported |= IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
|
|
- IA32_DEBUGCTLMSR_BTINT;
|
|
-
|
|
-            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
|
|
- supported |= IA32_DEBUGCTLMSR_BTS_OFF_OS |
|
|
- IA32_DEBUGCTLMSR_BTS_OFF_USR;
|
|
- if ( !(msr_content & ~supported) &&
|
|
- vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
|
|
- return 0;
|
|
- if ( (msr_content & supported) &&
|
|
- !vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
|
|
- printk(XENLOG_G_WARNING
|
|
- "%pv: Debug Store unsupported on this CPU\n",
|
|
- current);
|
|
- }
|
|
return -EINVAL;
|
|
- }
|
|
|
|
ASSERT(!supported);
|
|
|
|
@@ -613,7 +591,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
|
|
return -EINVAL;
|
|
|
|
if ( is_hvm_vcpu(v) )
|
|
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
|
|
+ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
|
|
&core2_vpmu_cxt->global_ctrl);
|
|
else
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
|
|
@@ -682,7 +660,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
|
|
return -EINVAL;
|
|
|
|
if ( is_hvm_vcpu(v) )
|
|
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
|
|
+ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
|
|
&core2_vpmu_cxt->global_ctrl);
|
|
else
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
|
|
@@ -701,7 +679,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
|
|
else
|
|
{
|
|
if ( is_hvm_vcpu(v) )
|
|
- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
|
|
+ vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
|
|
else
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
|
|
}
|
|
@@ -735,7 +713,7 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
|
|
break;
|
|
case MSR_CORE_PERF_GLOBAL_CTRL:
|
|
if ( is_hvm_vcpu(v) )
|
|
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
|
|
+ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
|
|
else
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, *msr_content);
|
|
break;
|
|
diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
|
|
index beee47d0ed..5cc89e2b34 100644
|
|
--- a/xen/arch/x86/cpuid.c
|
|
+++ b/xen/arch/x86/cpuid.c
|
|
@@ -43,6 +43,11 @@ static int __init parse_xen_cpuid(const char *s)
|
|
if ( !val )
|
|
setup_clear_cpu_cap(X86_FEATURE_STIBP);
|
|
}
|
|
+ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
|
|
+ {
|
|
+ if ( !val )
|
|
+ setup_clear_cpu_cap(X86_FEATURE_L1D_FLUSH);
|
|
+ }
|
|
else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 )
|
|
{
|
|
if ( !val )
|
|
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
|
|
index 9850a782ec..c39cf2c6e5 100644
|
|
--- a/xen/arch/x86/domain.c
|
|
+++ b/xen/arch/x86/domain.c
|
|
@@ -107,10 +107,11 @@ static void play_dead(void)
|
|
local_irq_disable();
|
|
|
|
/*
|
|
- * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
|
|
- * as they may be freed at any time. In this case, heap corruption or
|
|
- * #PF can occur (when heap debugging is enabled). For example, even
|
|
- * printk() can involve tasklet scheduling, which touches per-cpu vars.
|
|
+     * NOTE: After cpu_exit_clear, per-cpu variables may become inaccessible,
|
|
+ * as they may be freed at any time if offline CPUs don't get parked. In
|
|
+ * this case, heap corruption or #PF can occur (when heap debugging is
|
|
+ * enabled). For example, even printk() can involve tasklet scheduling,
|
|
+ * which touches per-cpu vars.
|
|
*
|
|
* Consider very carefully when adding code to *dead_idle. Most hypervisor
|
|
* subsystems are unsafe to call.
|
|
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
|
|
index 8fbbf3aeb3..dd91038a67 100644
|
|
--- a/xen/arch/x86/domctl.c
|
|
+++ b/xen/arch/x86/domctl.c
|
|
@@ -225,7 +225,8 @@ static int update_domain_cpuid_info(struct domain *d,
|
|
*/
|
|
call_policy_changed = (is_hvm_domain(d) &&
|
|
((old_7d0 ^ p->feat.raw[0].d) &
|
|
- cpufeat_mask(X86_FEATURE_IBRSB)));
|
|
+ (cpufeat_mask(X86_FEATURE_IBRSB) |
|
|
+ cpufeat_mask(X86_FEATURE_L1D_FLUSH))));
|
|
break;
|
|
|
|
case 0xa:
|
|
@@ -1163,7 +1164,7 @@ long arch_do_domctl(
|
|
if ( _xcr0_accum )
|
|
{
|
|
if ( evc->size >= PV_XSAVE_HDR_SIZE + XSTATE_AREA_MIN_SIZE )
|
|
- ret = validate_xstate(_xcr0, _xcr0_accum,
|
|
+ ret = validate_xstate(d, _xcr0, _xcr0_accum,
|
|
&_xsave_area->xsave_hdr);
|
|
}
|
|
else if ( !_xcr0 )
|
|
@@ -1187,8 +1188,7 @@ long arch_do_domctl(
|
|
vcpu_pause(v);
|
|
v->arch.xcr0 = _xcr0;
|
|
v->arch.xcr0_accum = _xcr0_accum;
|
|
- if ( _xcr0_accum & XSTATE_NONLAZY )
|
|
- v->arch.nonlazy_xstate_used = 1;
|
|
+ v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY;
|
|
compress_xsave_states(v, _xsave_area,
|
|
evc->size - PV_XSAVE_HDR_SIZE);
|
|
vcpu_unpause(v);
|
|
diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
|
|
index 4779b0d0d5..d997806272 100644
|
|
--- a/xen/arch/x86/genapic/x2apic.c
|
|
+++ b/xen/arch/x86/genapic/x2apic.c
|
|
@@ -201,18 +201,21 @@ static int update_clusterinfo(
|
|
if ( !cluster_cpus_spare )
|
|
cluster_cpus_spare = xzalloc(cpumask_t);
|
|
if ( !cluster_cpus_spare ||
|
|
- !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
|
|
+ !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
|
|
err = -ENOMEM;
|
|
break;
|
|
case CPU_UP_CANCELED:
|
|
case CPU_DEAD:
|
|
+ case CPU_REMOVE:
|
|
+ if ( park_offline_cpus == (action != CPU_REMOVE) )
|
|
+ break;
|
|
if ( per_cpu(cluster_cpus, cpu) )
|
|
{
|
|
cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
|
|
if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
|
|
- xfree(per_cpu(cluster_cpus, cpu));
|
|
+ XFREE(per_cpu(cluster_cpus, cpu));
|
|
}
|
|
- free_cpumask_var(per_cpu(scratch_mask, cpu));
|
|
+ FREE_CPUMASK_VAR(per_cpu(scratch_mask, cpu));
|
|
break;
|
|
}
|
|
|
|
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
|
|
index c23983cdff..4cbb688c05 100644
|
|
--- a/xen/arch/x86/hvm/hvm.c
|
|
+++ b/xen/arch/x86/hvm/hvm.c
|
|
@@ -907,6 +907,9 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value,
|
|
else
|
|
p = &host_cpuid_policy;
|
|
|
|
+ if ( value & ~EFER_KNOWN_MASK )
|
|
+ return "Unknown bits set";
|
|
+
|
|
if ( (value & EFER_SCE) && !p->extd.syscall )
|
|
return "SCE without feature";
|
|
|
|
@@ -1269,7 +1272,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
|
|
ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
|
|
h->cur += desc->length;
|
|
|
|
- err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum,
|
|
+ err = validate_xstate(d, ctxt->xcr0, ctxt->xcr0_accum,
|
|
(const void *)&ctxt->save_area.xsave_hdr);
|
|
if ( err )
|
|
{
|
|
@@ -1324,8 +1327,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
|
|
|
|
v->arch.xcr0 = ctxt->xcr0;
|
|
v->arch.xcr0_accum = ctxt->xcr0_accum;
|
|
- if ( ctxt->xcr0_accum & XSTATE_NONLAZY )
|
|
- v->arch.nonlazy_xstate_used = 1;
|
|
+ v->arch.nonlazy_xstate_used = ctxt->xcr0_accum & XSTATE_NONLAZY;
|
|
compress_xsave_states(v, &ctxt->save_area,
|
|
size - offsetof(struct hvm_hw_cpu_xsave, save_area));
|
|
|
|
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
|
|
index 165500e3f2..b964c59dad 100644
|
|
--- a/xen/arch/x86/hvm/svm/svm.c
|
|
+++ b/xen/arch/x86/hvm/svm/svm.c
|
|
@@ -1432,24 +1432,18 @@ static void svm_inject_event(const struct x86_event *event)
|
|
* Xen must emulate enough of the event injection to be sure that a
|
|
* further fault shouldn't occur during delivery. This covers the fact
|
|
* that hardware doesn't perform DPL checking on injection.
|
|
- *
|
|
- * Also, it accounts for proper positioning of %rip for an event with trap
|
|
- * semantics (where %rip should point after the instruction) which suffers
|
|
- * a fault during injection (at which point %rip should point at the
|
|
- * instruction).
|
|
*/
|
|
if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION ||
|
|
- (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT ||
|
|
- event->type == X86_EVENTTYPE_SW_EXCEPTION)) )
|
|
+ (!cpu_has_svm_nrips && (event->type >= X86_EVENTTYPE_SW_INTERRUPT)) )
|
|
svm_emul_swint_injection(&_event);
|
|
|
|
- switch ( _event.vector )
|
|
+ switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) )
|
|
{
|
|
case TRAP_debug:
|
|
if ( regs->eflags & X86_EFLAGS_TF )
|
|
{
|
|
__restore_debug_registers(vmcb, curr);
|
|
- vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000);
|
|
+ vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | DR_STEP);
|
|
}
|
|
/* fall through */
|
|
case TRAP_int3:
|
|
@@ -1459,6 +1453,13 @@ static void svm_inject_event(const struct x86_event *event)
|
|
domain_pause_for_debugger();
|
|
return;
|
|
}
|
|
+ break;
|
|
+
|
|
+ case TRAP_page_fault:
|
|
+ ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION);
|
|
+ curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
|
|
+ vmcb_set_cr2(vmcb, _event.cr2);
|
|
+ break;
|
|
}
|
|
|
|
if ( unlikely(eventinj.fields.v) &&
|
|
@@ -1481,13 +1482,9 @@ static void svm_inject_event(const struct x86_event *event)
|
|
* icebp, software events with trap semantics need emulating, so %rip in
|
|
* the trap frame points after the instruction.
|
|
*
|
|
- * The x86 emulator (if requested by the x86_swint_emulate_* choice) will
|
|
- * have performed checks such as presence/dpl/etc and believes that the
|
|
- * event injection will succeed without faulting.
|
|
- *
|
|
- * The x86 emulator will always provide fault semantics for software
|
|
- * events, with _trap.insn_len set appropriately. If the injection
|
|
- * requires emulation, move %rip forwards at this point.
|
|
+ * svm_emul_swint_injection() has already confirmed that events with trap
|
|
+ * semantics won't fault on injection. Position %rip/NextRIP suitably,
|
|
+ * and restrict the event type to what hardware will tolerate.
|
|
*/
|
|
switch ( _event.type )
|
|
{
|
|
@@ -1544,16 +1541,12 @@ static void svm_inject_event(const struct x86_event *event)
|
|
eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode);
|
|
vmcb->eventinj = eventinj;
|
|
|
|
- if ( _event.vector == TRAP_page_fault )
|
|
- {
|
|
- curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
|
|
- vmcb_set_cr2(vmcb, _event.cr2);
|
|
- HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2));
|
|
- }
|
|
+ if ( _event.vector == TRAP_page_fault &&
|
|
+ _event.type == X86_EVENTTYPE_HW_EXCEPTION )
|
|
+ HVMTRACE_LONG_2D(PF_INJECT, _event.error_code,
|
|
+ TRC_PAR_LONG(_event.cr2));
|
|
else
|
|
- {
|
|
HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
|
|
- }
|
|
}
|
|
|
|
static int svm_event_pending(struct vcpu *v)
|
|
diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
|
|
index aa2f103895..afd552f2b9 100644
|
|
--- a/xen/arch/x86/hvm/vmx/entry.S
|
|
+++ b/xen/arch/x86/hvm/vmx/entry.S
|
|
@@ -41,6 +41,15 @@ ENTRY(vmx_asm_vmexit_handler)
|
|
SPEC_CTRL_ENTRY_FROM_HVM /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
|
|
/* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
|
|
|
|
+ /* Hardware clears MSR_DEBUGCTL on VMExit. Reinstate it if debugging Xen. */
|
|
+ .macro restore_lbr
|
|
+ mov $IA32_DEBUGCTLMSR_LBR, %eax
|
|
+ mov $MSR_IA32_DEBUGCTLMSR, %ecx
|
|
+ xor %edx, %edx
|
|
+ wrmsr
|
|
+ .endm
|
|
+ ALTERNATIVE "", restore_lbr, X86_FEATURE_XEN_LBR
|
|
+
|
|
mov %rsp,%rdi
|
|
call vmx_vmexit_handler
|
|
|
|
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
|
|
index 258fc08f72..2ba0c40808 100644
|
|
--- a/xen/arch/x86/hvm/vmx/vmcs.c
|
|
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
|
|
@@ -38,6 +38,7 @@
|
|
#include <asm/flushtlb.h>
|
|
#include <asm/monitor.h>
|
|
#include <asm/shadow.h>
|
|
+#include <asm/spec_ctrl.h>
|
|
#include <asm/tboot.h>
|
|
#include <asm/apic.h>
|
|
|
|
@@ -996,6 +997,7 @@ static int construct_vmcs(struct vcpu *v)
|
|
struct domain *d = v->domain;
|
|
u32 vmexit_ctl = vmx_vmexit_control;
|
|
u32 vmentry_ctl = vmx_vmentry_control;
|
|
+ int rc = 0;
|
|
|
|
vmx_vmcs_enter(v);
|
|
|
|
@@ -1083,8 +1085,8 @@ static int construct_vmcs(struct vcpu *v)
|
|
|
|
if ( msr_bitmap == NULL )
|
|
{
|
|
- vmx_vmcs_exit(v);
|
|
- return -ENOMEM;
|
|
+ rc = -ENOMEM;
|
|
+ goto out;
|
|
}
|
|
|
|
memset(msr_bitmap, ~0, PAGE_SIZE);
|
|
@@ -1268,141 +1270,197 @@ static int construct_vmcs(struct vcpu *v)
|
|
if ( cpu_has_vmx_tsc_scaling )
|
|
__vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio);
|
|
|
|
- vmx_vmcs_exit(v);
|
|
-
|
|
/* will update HOST & GUEST_CR3 as reqd */
|
|
paging_update_paging_modes(v);
|
|
|
|
vmx_vlapic_msr_changed(v);
|
|
|
|
- return 0;
|
|
+ if ( opt_l1d_flush && paging_mode_hap(d) )
|
|
+ rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
|
|
+ VMX_MSR_GUEST_LOADONLY);
|
|
+
|
|
+ out:
|
|
+ vmx_vmcs_exit(v);
|
|
+
|
|
+ return rc;
|
|
}
|
|
|
|
-static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
|
|
+/*
|
|
+ * Search an MSR list looking for an MSR entry, or the slot in which it should
|
|
+ * live (to keep the data sorted) if an entry is not found.
|
|
+ *
|
|
+ * The return pointer is guaranteed to be bounded by start and end. However,
|
|
+ * it may point at end, and may be invalid for the caller to dereference.
|
|
+ */
|
|
+static struct vmx_msr_entry *locate_msr_entry(
|
|
+ struct vmx_msr_entry *start, struct vmx_msr_entry *end, uint32_t msr)
|
|
{
|
|
- const u32 *msr = key;
|
|
- const struct vmx_msr_entry *entry = elt;
|
|
+ while ( start < end )
|
|
+ {
|
|
+ struct vmx_msr_entry *mid = start + (end - start) / 2;
|
|
|
|
- if ( *msr > entry->index )
|
|
- return 1;
|
|
- if ( *msr < entry->index )
|
|
- return -1;
|
|
+ if ( msr < mid->index )
|
|
+ end = mid;
|
|
+ else if ( msr > mid->index )
|
|
+ start = mid + 1;
|
|
+ else
|
|
+ return mid;
|
|
+ }
|
|
|
|
- return 0;
|
|
+ return start;
|
|
}
|
|
|
|
-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
|
|
+struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
|
|
+ enum vmx_msr_list_type type)
|
|
{
|
|
- struct vcpu *curr = current;
|
|
- unsigned int msr_count;
|
|
- struct vmx_msr_entry *msr_area;
|
|
+ const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
|
|
+ struct vmx_msr_entry *start = NULL, *ent, *end;
|
|
+ unsigned int substart = 0, subend = vmx->msr_save_count;
|
|
+ unsigned int total = vmx->msr_load_count;
|
|
|
|
- if ( type == VMX_GUEST_MSR )
|
|
- {
|
|
- msr_count = curr->arch.hvm_vmx.msr_count;
|
|
- msr_area = curr->arch.hvm_vmx.msr_area;
|
|
- }
|
|
- else
|
|
+ ASSERT(v == current || !vcpu_runnable(v));
|
|
+
|
|
+ switch ( type )
|
|
{
|
|
- ASSERT(type == VMX_HOST_MSR);
|
|
- msr_count = curr->arch.hvm_vmx.host_msr_count;
|
|
- msr_area = curr->arch.hvm_vmx.host_msr_area;
|
|
+ case VMX_MSR_HOST:
|
|
+ start = vmx->host_msr_area;
|
|
+ subend = vmx->host_msr_count;
|
|
+ total = subend;
|
|
+ break;
|
|
+
|
|
+ case VMX_MSR_GUEST:
|
|
+ start = vmx->msr_area;
|
|
+ break;
|
|
+
|
|
+ case VMX_MSR_GUEST_LOADONLY:
|
|
+ start = vmx->msr_area;
|
|
+ substart = subend;
|
|
+ subend = total;
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ ASSERT_UNREACHABLE();
|
|
}
|
|
|
|
- if ( msr_area == NULL )
|
|
+ if ( !start )
|
|
return NULL;
|
|
|
|
- return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry),
|
|
- vmx_msr_entry_key_cmp);
|
|
+ end = start + total;
|
|
+ ent = locate_msr_entry(start + substart, start + subend, msr);
|
|
+
|
|
+ return ((ent < end) && (ent->index == msr)) ? ent : NULL;
|
|
}
|
|
|
|
-int vmx_read_guest_msr(u32 msr, u64 *val)
|
|
+int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
|
|
+ enum vmx_msr_list_type type)
|
|
{
|
|
- struct vmx_msr_entry *ent;
|
|
+ struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
|
|
+ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
|
|
+ unsigned int substart, subend, total;
|
|
+ int rc;
|
|
|
|
- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
|
|
+ ASSERT(v == current || !vcpu_runnable(v));
|
|
+
|
|
+ switch ( type )
|
|
{
|
|
- *val = ent->data;
|
|
- return 0;
|
|
- }
|
|
+ case VMX_MSR_HOST:
|
|
+ ptr = &vmx->host_msr_area;
|
|
+ substart = 0;
|
|
+ subend = vmx->host_msr_count;
|
|
+ total = subend;
|
|
+ break;
|
|
|
|
- return -ESRCH;
|
|
-}
|
|
+ case VMX_MSR_GUEST:
|
|
+ ptr = &vmx->msr_area;
|
|
+ substart = 0;
|
|
+ subend = vmx->msr_save_count;
|
|
+ total = vmx->msr_load_count;
|
|
+ break;
|
|
|
|
-int vmx_write_guest_msr(u32 msr, u64 val)
|
|
-{
|
|
- struct vmx_msr_entry *ent;
|
|
+ case VMX_MSR_GUEST_LOADONLY:
|
|
+ ptr = &vmx->msr_area;
|
|
+ substart = vmx->msr_save_count;
|
|
+ subend = vmx->msr_load_count;
|
|
+ total = subend;
|
|
+ break;
|
|
|
|
- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
|
|
- {
|
|
- ent->data = val;
|
|
- return 0;
|
|
+ default:
|
|
+ ASSERT_UNREACHABLE();
|
|
+ return -EINVAL;
|
|
}
|
|
|
|
- return -ESRCH;
|
|
-}
|
|
-
|
|
-int vmx_add_msr(u32 msr, int type)
|
|
-{
|
|
- struct vcpu *curr = current;
|
|
- unsigned int idx, *msr_count;
|
|
- struct vmx_msr_entry **msr_area, *msr_area_elem;
|
|
+ vmx_vmcs_enter(v);
|
|
|
|
- if ( type == VMX_GUEST_MSR )
|
|
- {
|
|
- msr_count = &curr->arch.hvm_vmx.msr_count;
|
|
- msr_area = &curr->arch.hvm_vmx.msr_area;
|
|
- }
|
|
- else
|
|
+ /* Allocate memory on first use. */
|
|
+ if ( unlikely(!*ptr) )
|
|
{
|
|
- ASSERT(type == VMX_HOST_MSR);
|
|
- msr_count = &curr->arch.hvm_vmx.host_msr_count;
|
|
- msr_area = &curr->arch.hvm_vmx.host_msr_area;
|
|
- }
|
|
+ paddr_t addr;
|
|
|
|
- if ( *msr_area == NULL )
|
|
- {
|
|
- if ( (*msr_area = alloc_xenheap_page()) == NULL )
|
|
- return -ENOMEM;
|
|
+ if ( (*ptr = alloc_xenheap_page()) == NULL )
|
|
+ {
|
|
+ rc = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
|
|
- if ( type == VMX_GUEST_MSR )
|
|
+ addr = virt_to_maddr(*ptr);
|
|
+
|
|
+ switch ( type )
|
|
{
|
|
- __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
|
|
- __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
|
|
+ case VMX_MSR_HOST:
|
|
+ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, addr);
|
|
+ break;
|
|
+
|
|
+ case VMX_MSR_GUEST:
|
|
+ case VMX_MSR_GUEST_LOADONLY:
|
|
+ __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr);
|
|
+ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr);
|
|
+ break;
|
|
}
|
|
- else
|
|
- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
|
|
}
|
|
|
|
- for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
|
|
- if ( (*msr_area)[idx].index == msr )
|
|
- return 0;
|
|
+ start = *ptr;
|
|
+ end = start + total;
|
|
+ ent = locate_msr_entry(start + substart, start + subend, msr);
|
|
|
|
- if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
|
|
- return -ENOSPC;
|
|
+ if ( (ent < end) && (ent->index == msr) )
|
|
+ goto found;
|
|
|
|
- memmove(*msr_area + idx + 1, *msr_area + idx,
|
|
- sizeof(*msr_area_elem) * (*msr_count - idx));
|
|
+ /* If there isn't an existing entry for msr, insert room for one. */
|
|
+ if ( total == (PAGE_SIZE / sizeof(*ent)) )
|
|
+ {
|
|
+ rc = -ENOSPC;
|
|
+ goto out;
|
|
+ }
|
|
|
|
- msr_area_elem = *msr_area + idx;
|
|
- msr_area_elem->index = msr;
|
|
- msr_area_elem->mbz = 0;
|
|
+ memmove(ent + 1, ent, sizeof(*ent) * (end - ent));
|
|
|
|
- ++*msr_count;
|
|
+ ent->index = msr;
|
|
+ ent->mbz = 0;
|
|
|
|
- if ( type == VMX_GUEST_MSR )
|
|
+ switch ( type )
|
|
{
|
|
- msr_area_elem->data = 0;
|
|
- __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
|
|
- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
|
|
- }
|
|
- else
|
|
- {
|
|
- rdmsrl(msr, msr_area_elem->data);
|
|
- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
|
|
+ case VMX_MSR_HOST:
|
|
+ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count);
|
|
+ break;
|
|
+
|
|
+ case VMX_MSR_GUEST:
|
|
+ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_save_count);
|
|
+
|
|
+ /* Fallthrough */
|
|
+ case VMX_MSR_GUEST_LOADONLY:
|
|
+ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, ++vmx->msr_load_count);
|
|
+ break;
|
|
}
|
|
|
|
- return 0;
|
|
+ /* Set the msr's value. */
|
|
+ found:
|
|
+ ent->data = val;
|
|
+ rc = 0;
|
|
+
|
|
+ out:
|
|
+ vmx_vmcs_exit(v);
|
|
+
|
|
+ return rc;
|
|
}
|
|
|
|
void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector)
|
|
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
|
|
index 610c8d6eb9..b0fababede 100644
|
|
--- a/xen/arch/x86/hvm/vmx/vmx.c
|
|
+++ b/xen/arch/x86/hvm/vmx/vmx.c
|
|
@@ -583,6 +583,12 @@ static void vmx_cpuid_policy_changed(struct vcpu *v)
|
|
vmx_clear_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW);
|
|
else
|
|
vmx_set_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW);
|
|
+
|
|
+ /* MSR_FLUSH_CMD is safe to pass through if the guest knows about it. */
|
|
+ if ( cp->feat.l1d_flush )
|
|
+ vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
|
|
+ else
|
|
+ vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
|
|
}
|
|
|
|
int vmx_guest_x86_mode(struct vcpu *v)
|
|
@@ -2758,8 +2764,10 @@ enum
|
|
|
|
#define LBR_FROM_SIGNEXT_2MSB ((1ULL << 59) | (1ULL << 60))
|
|
|
|
-#define FIXUP_LBR_TSX (1u << 0)
|
|
-#define FIXUP_BDW_ERRATUM_BDF14 (1u << 1)
|
|
+#define LBR_MSRS_INSERTED (1u << 0)
|
|
+#define LBR_FIXUP_TSX (1u << 1)
|
|
+#define LBR_FIXUP_BDF14 (1u << 2)
|
|
+#define LBR_FIXUP_MASK (LBR_FIXUP_TSX | LBR_FIXUP_BDF14)
|
|
|
|
static bool __read_mostly lbr_tsx_fixup_needed;
|
|
static bool __read_mostly bdw_erratum_bdf14_fixup_needed;
|
|
@@ -2822,7 +2830,7 @@ static int is_last_branch_msr(u32 ecx)
|
|
|
|
static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
|
|
{
|
|
- const struct vcpu *curr = current;
|
|
+ struct vcpu *curr = current;
|
|
|
|
HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x", msr);
|
|
|
|
@@ -2901,7 +2909,7 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
|
|
if ( passive_domain_do_rdmsr(msr, msr_content) )
|
|
goto done;
|
|
|
|
- if ( vmx_read_guest_msr(msr, msr_content) == 0 )
|
|
+ if ( vmx_read_guest_msr(curr, msr, msr_content) == 0 )
|
|
break;
|
|
|
|
if ( is_last_branch_msr(msr) )
|
|
@@ -3036,11 +3044,14 @@ void vmx_vlapic_msr_changed(struct vcpu *v)
|
|
static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
|
|
{
|
|
struct vcpu *v = current;
|
|
+ const struct cpuid_policy *cp = v->domain->arch.cpuid;
|
|
|
|
HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content);
|
|
|
|
switch ( msr )
|
|
{
|
|
+ uint64_t rsvd;
|
|
+
|
|
case MSR_IA32_SYSENTER_CS:
|
|
__vmwrite(GUEST_SYSENTER_CS, msr_content);
|
|
break;
|
|
@@ -3093,45 +3104,85 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
|
|
wrmsrl(MSR_SYSCALL_MASK, msr_content);
|
|
break;
|
|
|
|
- case MSR_IA32_DEBUGCTLMSR: {
|
|
- int i, rc = 0;
|
|
- uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF;
|
|
+ case MSR_IA32_DEBUGCTLMSR:
|
|
+ rsvd = ~(IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF);
|
|
|
|
- if ( boot_cpu_has(X86_FEATURE_RTM) )
|
|
- supported |= IA32_DEBUGCTLMSR_RTM;
|
|
- if ( msr_content & ~supported )
|
|
+ /* TODO: Wire vPMU settings properly through the CPUID policy */
|
|
+ if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_BTS) )
|
|
{
|
|
- /* Perhaps some other bits are supported in vpmu. */
|
|
- if ( vpmu_do_wrmsr(msr, msr_content, supported) )
|
|
- break;
|
|
+ rsvd &= ~(IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
|
|
+ IA32_DEBUGCTLMSR_BTINT);
|
|
+
|
|
+ if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
|
|
+ rsvd &= ~(IA32_DEBUGCTLMSR_BTS_OFF_OS |
|
|
+ IA32_DEBUGCTLMSR_BTS_OFF_USR);
|
|
}
|
|
- if ( msr_content & IA32_DEBUGCTLMSR_LBR )
|
|
+
|
|
+ if ( cp->feat.rtm )
|
|
+ rsvd &= ~IA32_DEBUGCTLMSR_RTM;
|
|
+
|
|
+ if ( msr_content & rsvd )
|
|
+ goto gp_fault;
|
|
+
|
|
+ /*
|
|
+ * When a guest first enables LBR, arrange to save and restore the LBR
|
|
+ * MSRs and allow the guest direct access.
|
|
+ *
|
|
+ * MSR_DEBUGCTL and LBR has existed almost as long as MSRs have
|
|
+ * existed, and there is no architectural way to hide the feature, or
|
|
+ * fail the attempt to enable LBR.
|
|
+ *
|
|
+ * Unknown host LBR MSRs or hitting -ENOSPC with the guest load/save
|
|
+ * list are definitely hypervisor bugs, whereas -ENOMEM for allocating
|
|
+ * the load/save list is simply unlucky (and shouldn't occur with
|
|
+ * sensible management by the toolstack).
|
|
+ *
|
|
+ * Either way, there is nothing we can do right now to recover, and
|
|
+ * the guest won't execute correctly either. Simply crash the domain
|
|
+ * to make the failure obvious.
|
|
+ */
|
|
+ if ( !(v->arch.hvm_vmx.lbr_flags & LBR_MSRS_INSERTED) &&
|
|
+ (msr_content & IA32_DEBUGCTLMSR_LBR) )
|
|
{
|
|
const struct lbr_info *lbr = last_branch_msr_get();
|
|
- if ( lbr == NULL )
|
|
- break;
|
|
|
|
- for ( ; (rc == 0) && lbr->count; lbr++ )
|
|
- for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
|
|
- if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
|
|
+ if ( unlikely(!lbr) )
|
|
+ {
|
|
+ gprintk(XENLOG_ERR, "Unknown Host LBR MSRs\n");
|
|
+ domain_crash(v->domain);
|
|
+ return X86EMUL_OKAY;
|
|
+ }
|
|
+
|
|
+ for ( ; lbr->count; lbr++ )
|
|
+ {
|
|
+ unsigned int i;
|
|
+
|
|
+ for ( i = 0; i < lbr->count; i++ )
|
|
+ {
|
|
+ int rc = vmx_add_guest_msr(v, lbr->base + i, 0);
|
|
+
|
|
+ if ( unlikely(rc) )
|
|
{
|
|
- vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
|
|
- if ( lbr_tsx_fixup_needed )
|
|
- v->arch.hvm_vmx.lbr_fixup_enabled |= FIXUP_LBR_TSX;
|
|
- if ( bdw_erratum_bdf14_fixup_needed )
|
|
- v->arch.hvm_vmx.lbr_fixup_enabled |=
|
|
- FIXUP_BDW_ERRATUM_BDF14;
|
|
+ gprintk(XENLOG_ERR,
|
|
+ "Guest load/save list error %d\n", rc);
|
|
+ domain_crash(v->domain);
|
|
+ return X86EMUL_OKAY;
|
|
}
|
|
- }
|
|
|
|
- if ( (rc < 0) ||
|
|
- (msr_content && (vmx_add_host_load_msr(msr) < 0)) )
|
|
- hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC);
|
|
- else
|
|
- __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
|
|
+ vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED;
|
|
+ if ( lbr_tsx_fixup_needed )
|
|
+ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_TSX;
|
|
+ if ( bdw_erratum_bdf14_fixup_needed )
|
|
+ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF14;
|
|
+ }
|
|
|
|
+ __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
|
|
break;
|
|
- }
|
|
+
|
|
case MSR_IA32_FEATURE_CONTROL:
|
|
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
|
|
/* None of these MSRs are writeable. */
|
|
@@ -3154,7 +3205,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
|
|
if ( wrmsr_viridian_regs(msr, msr_content) )
|
|
break;
|
|
|
|
- if ( vmx_write_guest_msr(msr, msr_content) == 0 ||
|
|
+ if ( vmx_write_guest_msr(v, msr, msr_content) == 0 ||
|
|
is_last_branch_msr(msr) )
|
|
break;
|
|
|
|
@@ -3701,6 +3752,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
|
|
*/
|
|
__vmread(EXIT_QUALIFICATION, &exit_qualification);
|
|
HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
|
|
+ __restore_debug_registers(v);
|
|
write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
|
|
if ( !v->domain->debugger_attached )
|
|
{
|
|
@@ -4165,11 +4217,11 @@ out:
|
|
static void lbr_tsx_fixup(void)
|
|
{
|
|
struct vcpu *curr = current;
|
|
- unsigned int msr_count = curr->arch.hvm_vmx.msr_count;
|
|
+ unsigned int msr_count = curr->arch.hvm_vmx.msr_save_count;
|
|
struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
|
|
struct vmx_msr_entry *msr;
|
|
|
|
- if ( (msr = vmx_find_msr(lbr_from_start, VMX_GUEST_MSR)) != NULL )
|
|
+ if ( (msr = vmx_find_msr(curr, lbr_from_start, VMX_MSR_GUEST)) != NULL )
|
|
{
|
|
/*
|
|
* Sign extend into bits 61:62 while preserving bit 63
|
|
@@ -4179,15 +4231,15 @@ static void lbr_tsx_fixup(void)
|
|
msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
|
|
}
|
|
|
|
- if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_GUEST_MSR)) != NULL )
|
|
+ if ( (msr = vmx_find_msr(curr, lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
|
|
msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
|
|
}
|
|
|
|
-static void sign_extend_msr(u32 msr, int type)
|
|
+static void sign_extend_msr(struct vcpu *v, u32 msr, int type)
|
|
{
|
|
struct vmx_msr_entry *entry;
|
|
|
|
- if ( (entry = vmx_find_msr(msr, type)) != NULL )
|
|
+ if ( (entry = vmx_find_msr(v, msr, type)) != NULL )
|
|
{
|
|
if ( entry->data & VADDR_TOP_BIT )
|
|
entry->data |= CANONICAL_MASK;
|
|
@@ -4198,6 +4250,8 @@ static void sign_extend_msr(u32 msr, int type)
|
|
|
|
static void bdw_erratum_bdf14_fixup(void)
|
|
{
|
|
+ struct vcpu *curr = current;
|
|
+
|
|
/*
|
|
* Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has
|
|
* been observed to have the top three bits corrupted as though the
|
|
@@ -4207,17 +4261,17 @@ static void bdw_erratum_bdf14_fixup(void)
|
|
* erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by
|
|
* sign-extending into bits 48:63.
|
|
*/
|
|
- sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_GUEST_MSR);
|
|
- sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_GUEST_MSR);
|
|
+ sign_extend_msr(curr, MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
|
|
+ sign_extend_msr(curr, MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
|
|
}
|
|
|
|
static void lbr_fixup(void)
|
|
{
|
|
struct vcpu *curr = current;
|
|
|
|
- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_LBR_TSX )
|
|
+ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_TSX )
|
|
lbr_tsx_fixup();
|
|
- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_BDW_ERRATUM_BDF14 )
|
|
+ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF14 )
|
|
bdw_erratum_bdf14_fixup();
|
|
}
|
|
|
|
@@ -4285,7 +4339,7 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
|
|
}
|
|
|
|
out:
|
|
- if ( unlikely(curr->arch.hvm_vmx.lbr_fixup_enabled) )
|
|
+ if ( unlikely(curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_MASK) )
|
|
lbr_fixup();
|
|
|
|
HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
|
|
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
|
|
index bcf46c0743..7d4871b791 100644
|
|
--- a/xen/arch/x86/mm.c
|
|
+++ b/xen/arch/x86/mm.c
|
|
@@ -613,6 +613,9 @@ static int alloc_segdesc_page(struct page_info *page)
|
|
return i == 512 ? 0 : -EINVAL;
|
|
}
|
|
|
|
+static int _get_page_type(struct page_info *page, unsigned long type,
|
|
+ bool preemptible);
|
|
+
|
|
static int get_page_and_type_from_mfn(
|
|
mfn_t mfn, unsigned long type, struct domain *d,
|
|
int partial, int preemptible)
|
|
@@ -624,9 +627,7 @@ static int get_page_and_type_from_mfn(
|
|
unlikely(!get_page_from_mfn(mfn, d)) )
|
|
return -EINVAL;
|
|
|
|
- rc = (preemptible ?
|
|
- get_page_type_preemptible(page, type) :
|
|
- (get_page_type(page, type) ? 0 : -EINVAL));
|
|
+ rc = _get_page_type(page, type, preemptible);
|
|
|
|
if ( unlikely(rc) && partial >= 0 &&
|
|
(!preemptible || page != current->arch.old_guest_table) )
|
|
@@ -1115,7 +1116,7 @@ get_page_from_l2e(
|
|
int rc;
|
|
|
|
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
|
|
- return 1;
|
|
+ return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
|
|
|
|
if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
|
|
{
|
|
@@ -1146,7 +1147,7 @@ get_page_from_l3e(
|
|
int rc;
|
|
|
|
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
|
|
- return 1;
|
|
+ return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
|
|
|
|
if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
|
|
{
|
|
@@ -1179,7 +1180,7 @@ get_page_from_l4e(
|
|
int rc;
|
|
|
|
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
|
|
- return 1;
|
|
+ return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
|
|
|
|
if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
|
|
{
|
|
@@ -1389,6 +1390,13 @@ static int alloc_l1_table(struct page_info *page)
|
|
|
|
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
|
|
{
|
|
+ if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
|
|
+ {
|
|
+ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
|
|
+ if ( ret )
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
|
|
{
|
|
default:
|
|
@@ -1409,6 +1417,7 @@ static int alloc_l1_table(struct page_info *page)
|
|
|
|
fail:
|
|
gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
|
|
+ out:
|
|
while ( i-- > 0 )
|
|
put_page_from_l1e(pl1e[i], d);
|
|
|
|
@@ -1456,8 +1465,7 @@ static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
|
|
return 1;
|
|
}
|
|
|
|
-static int alloc_l2_table(struct page_info *page, unsigned long type,
|
|
- int preemptible)
|
|
+static int alloc_l2_table(struct page_info *page, unsigned long type)
|
|
{
|
|
struct domain *d = page_get_owner(page);
|
|
unsigned long pfn = mfn_x(page_to_mfn(page));
|
|
@@ -1469,8 +1477,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
|
|
|
|
for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
|
|
{
|
|
- if ( preemptible && i > page->nr_validated_ptes
|
|
- && hypercall_preempt_check() )
|
|
+ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
|
|
{
|
|
page->nr_validated_ptes = i;
|
|
rc = -ERESTART;
|
|
@@ -1481,6 +1488,12 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
|
|
(rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
|
|
continue;
|
|
|
|
+ if ( unlikely(rc == -ERESTART) )
|
|
+ {
|
|
+ page->nr_validated_ptes = i;
|
|
+ break;
|
|
+ }
|
|
+
|
|
if ( rc < 0 )
|
|
{
|
|
gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
|
|
@@ -1763,7 +1776,7 @@ static void free_l1_table(struct page_info *page)
|
|
}
|
|
|
|
|
|
-static int free_l2_table(struct page_info *page, int preemptible)
|
|
+static int free_l2_table(struct page_info *page)
|
|
{
|
|
struct domain *d = page_get_owner(page);
|
|
unsigned long pfn = mfn_x(page_to_mfn(page));
|
|
@@ -1777,7 +1790,7 @@ static int free_l2_table(struct page_info *page, int preemptible)
|
|
do {
|
|
if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
|
|
put_page_from_l2e(pl2e[i], pfn) == 0 &&
|
|
- preemptible && i && hypercall_preempt_check() )
|
|
+ i && hypercall_preempt_check() )
|
|
{
|
|
page->nr_validated_ptes = i;
|
|
err = -ERESTART;
|
|
@@ -2055,6 +2068,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
|
|
rc = -EBUSY;
|
|
}
|
|
}
|
|
+ else if ( pv_l1tf_check_l1e(pt_dom, nl1e) )
|
|
+ return -ERESTART;
|
|
else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
|
|
preserve_ad)) )
|
|
{
|
|
@@ -2118,6 +2133,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
|
|
rc = -EBUSY;
|
|
}
|
|
}
|
|
+ else if ( pv_l1tf_check_l2e(d, nl2e) )
|
|
+ return -ERESTART;
|
|
else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
|
|
preserve_ad)) )
|
|
{
|
|
@@ -2179,6 +2196,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
|
|
rc = -EFAULT;
|
|
}
|
|
}
|
|
+ else if ( pv_l1tf_check_l3e(d, nl3e) )
|
|
+ return -ERESTART;
|
|
else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
|
|
preserve_ad)) )
|
|
{
|
|
@@ -2244,6 +2263,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
|
|
rc = -EFAULT;
|
|
}
|
|
}
|
|
+ else if ( pv_l1tf_check_l4e(d, nl4e) )
|
|
+ return -ERESTART;
|
|
else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
|
|
preserve_ad)) )
|
|
{
|
|
@@ -2373,7 +2394,8 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
|
|
rc = alloc_l1_table(page);
|
|
break;
|
|
case PGT_l2_page_table:
|
|
- rc = alloc_l2_table(page, type, preemptible);
|
|
+ ASSERT(preemptible);
|
|
+ rc = alloc_l2_table(page, type);
|
|
break;
|
|
case PGT_l3_page_table:
|
|
ASSERT(preemptible);
|
|
@@ -2463,7 +2485,8 @@ int free_page_type(struct page_info *page, unsigned long type,
|
|
rc = 0;
|
|
break;
|
|
case PGT_l2_page_table:
|
|
- rc = free_l2_table(page, preemptible);
|
|
+ ASSERT(preemptible);
|
|
+ rc = free_l2_table(page);
|
|
break;
|
|
case PGT_l3_page_table:
|
|
ASSERT(preemptible);
|
|
@@ -3550,12 +3573,9 @@ long do_mmuext_op(
|
|
}
|
|
|
|
if ( rc == -ERESTART )
|
|
- {
|
|
- ASSERT(i < count);
|
|
rc = hypercall_create_continuation(
|
|
__HYPERVISOR_mmuext_op, "hihi",
|
|
uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
|
|
- }
|
|
else if ( curr->arch.old_guest_table )
|
|
{
|
|
XEN_GUEST_HANDLE_PARAM(void) null;
|
|
@@ -3861,12 +3881,9 @@ long do_mmu_update(
|
|
}
|
|
|
|
if ( rc == -ERESTART )
|
|
- {
|
|
- ASSERT(i < count);
|
|
rc = hypercall_create_continuation(
|
|
__HYPERVISOR_mmu_update, "hihi",
|
|
ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
|
|
- }
|
|
else if ( curr->arch.old_guest_table )
|
|
{
|
|
XEN_GUEST_HANDLE_PARAM(void) null;
|
|
@@ -4121,7 +4138,13 @@ static int __do_update_va_mapping(
|
|
long do_update_va_mapping(unsigned long va, u64 val64,
|
|
unsigned long flags)
|
|
{
|
|
- return __do_update_va_mapping(va, val64, flags, current->domain);
|
|
+ int rc = __do_update_va_mapping(va, val64, flags, current->domain);
|
|
+
|
|
+ if ( rc == -ERESTART )
|
|
+ rc = hypercall_create_continuation(
|
|
+ __HYPERVISOR_update_va_mapping, "lll", va, val64, flags);
|
|
+
|
|
+ return rc;
|
|
}
|
|
|
|
long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
|
|
@@ -4138,6 +4161,46 @@ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
|
|
|
|
put_pg_owner(pg_owner);
|
|
|
|
+ if ( rc == -ERESTART )
|
|
+ rc = hypercall_create_continuation(
|
|
+ __HYPERVISOR_update_va_mapping_otherdomain,
|
|
+ "llli", va, val64, flags, domid);
|
|
+
|
|
+ return rc;
|
|
+}
|
|
+
|
|
+int compat_update_va_mapping(unsigned int va, uint32_t lo, uint32_t hi,
|
|
+ unsigned int flags)
|
|
+{
|
|
+ int rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo,
|
|
+ flags, current->domain);
|
|
+
|
|
+ if ( rc == -ERESTART )
|
|
+ rc = hypercall_create_continuation(
|
|
+ __HYPERVISOR_update_va_mapping, "iiii", va, lo, hi, flags);
|
|
+
|
|
+ return rc;
|
|
+}
|
|
+
|
|
+int compat_update_va_mapping_otherdomain(unsigned int va,
|
|
+ uint32_t lo, uint32_t hi,
|
|
+ unsigned int flags, domid_t domid)
|
|
+{
|
|
+ struct domain *pg_owner;
|
|
+ int rc;
|
|
+
|
|
+ if ( (pg_owner = get_pg_owner(domid)) == NULL )
|
|
+ return -ESRCH;
|
|
+
|
|
+ rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, pg_owner);
|
|
+
|
|
+ put_pg_owner(pg_owner);
|
|
+
|
|
+ if ( rc == -ERESTART )
|
|
+ rc = hypercall_create_continuation(
|
|
+ __HYPERVISOR_update_va_mapping_otherdomain,
|
|
+ "iiiii", va, lo, hi, flags, domid);
|
|
+
|
|
return rc;
|
|
}
|
|
|
|
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
|
|
index 2b0445ffe9..dcee496eb0 100644
|
|
--- a/xen/arch/x86/mm/paging.c
|
|
+++ b/xen/arch/x86/mm/paging.c
|
|
@@ -873,6 +873,8 @@ void paging_dump_domain_info(struct domain *d)
|
|
printk(" paging assistance: ");
|
|
if ( paging_mode_shadow(d) )
|
|
printk("shadow ");
|
|
+ if ( paging_mode_sh_forced(d) )
|
|
+ printk("forced ");
|
|
if ( paging_mode_hap(d) )
|
|
printk("hap ");
|
|
if ( paging_mode_refcounts(d) )
|
|
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
|
|
index dd61b50eb7..fd42d734e7 100644
|
|
--- a/xen/arch/x86/mm/shadow/common.c
|
|
+++ b/xen/arch/x86/mm/shadow/common.c
|
|
@@ -3177,6 +3177,15 @@ static void sh_new_mode(struct domain *d, u32 new_mode)
|
|
ASSERT(paging_locked_by_me(d));
|
|
ASSERT(d != current->domain);
|
|
|
|
+ /*
|
|
+ * If PG_SH_forced has previously been activated because of writing an
|
|
+ * L1TF-vulnerable PTE, it must remain active for the remaining lifetime
|
|
+ * of the domain, even if the logdirty mode needs to be controlled for
|
|
+ * migration purposes.
|
|
+ */
|
|
+ if ( paging_mode_sh_forced(d) )
|
|
+ new_mode |= PG_SH_forced | PG_SH_enable;
|
|
+
|
|
d->arch.paging.mode = new_mode;
|
|
for_each_vcpu(d, v)
|
|
sh_update_paging_modes(v);
|
|
@@ -4057,6 +4066,33 @@ void shadow_audit_tables(struct vcpu *v)
|
|
|
|
#endif /* Shadow audit */
|
|
|
|
+#ifdef CONFIG_PV
|
|
+/* Tasklet body; 'data' is the struct domain pointer. Enables PG_SH_forced unless it is already set or the domain is dying, and crashes the domain if enabling fails. */
|
|
+void pv_l1tf_tasklet(unsigned long data)
|
|
+{
|
|
+ struct domain *d = (void *)data;
|
|
+ /* The mode change is made with the domain paused and the paging lock held; both are undone in reverse order below. */
|
|
+ domain_pause(d);
|
|
+ paging_lock(d);
|
|
+
|
|
+ if ( !paging_mode_sh_forced(d) && !d->is_dying )
|
|
+ {
|
|
+ int ret = shadow_one_bit_enable(d, PG_SH_forced);
|
|
+
|
|
+ if ( ret )
|
|
+ {
|
|
+ printk(XENLOG_G_ERR "d%d Failed to enable PG_SH_forced: %d\n",
|
|
+ d->domain_id, ret);
|
|
+ domain_crash(d);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ paging_unlock(d);
|
|
+ domain_unpause(d);
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_PV */
|
|
+
|
|
/*
|
|
* Local variables:
|
|
* mode: C
|
|
diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
|
|
index 49140e46f0..f3f6d48668 100644
|
|
--- a/xen/arch/x86/mpparse.c
|
|
+++ b/xen/arch/x86/mpparse.c
|
|
@@ -68,19 +68,26 @@ physid_mask_t phys_cpu_present_map;
|
|
|
|
void __init set_nr_cpu_ids(unsigned int max_cpus)
|
|
{
|
|
+ unsigned int tot_cpus = num_processors + disabled_cpus;
|
|
+
|
|
if (!max_cpus)
|
|
- max_cpus = num_processors + disabled_cpus;
|
|
+ max_cpus = tot_cpus;
|
|
if (max_cpus > NR_CPUS)
|
|
max_cpus = NR_CPUS;
|
|
else if (!max_cpus)
|
|
max_cpus = 1;
|
|
printk(XENLOG_INFO "SMP: Allowing %u CPUs (%d hotplug CPUs)\n",
|
|
max_cpus, max_t(int, max_cpus - num_processors, 0));
|
|
- nr_cpu_ids = max_cpus;
|
|
+
|
|
+ if (!park_offline_cpus)
|
|
+ tot_cpus = max_cpus;
|
|
+ nr_cpu_ids = min(tot_cpus, NR_CPUS + 0u);
|
|
+ if (park_offline_cpus && nr_cpu_ids < num_processors)
|
|
+ printk(XENLOG_WARNING "SMP: Cannot bring up %u further CPUs\n",
|
|
+ num_processors - nr_cpu_ids);
|
|
|
|
#ifndef nr_cpumask_bits
|
|
- nr_cpumask_bits = (max_cpus + (BITS_PER_LONG - 1)) &
|
|
- ~(BITS_PER_LONG - 1);
|
|
+ nr_cpumask_bits = ROUNDUP(nr_cpu_ids, BITS_PER_LONG);
|
|
printk(XENLOG_DEBUG "NR_CPUS:%u nr_cpumask_bits:%u\n",
|
|
NR_CPUS, nr_cpumask_bits);
|
|
#endif
|
|
diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
|
|
index 1e12ccb729..1a591dd2b5 100644
|
|
--- a/xen/arch/x86/msr.c
|
|
+++ b/xen/arch/x86/msr.c
|
|
@@ -150,6 +150,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
|
|
case MSR_AMD_PATCHLOADER:
|
|
case MSR_IA32_UCODE_WRITE:
|
|
case MSR_PRED_CMD:
|
|
+ case MSR_FLUSH_CMD:
|
|
/* Write-only */
|
|
goto gp_fault;
|
|
|
|
@@ -254,6 +255,17 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
|
|
wrmsrl(MSR_PRED_CMD, val);
|
|
break;
|
|
|
|
+ case MSR_FLUSH_CMD:
|
|
+ if ( !cp->feat.l1d_flush )
|
|
+ goto gp_fault; /* MSR available? */
|
|
+
|
|
+ if ( val & ~FLUSH_CMD_L1D )
|
|
+ goto gp_fault; /* Rsvd bit set? */
|
|
+
|
|
+ if ( v == curr )
|
|
+ wrmsrl(MSR_FLUSH_CMD, val);
|
|
+ break;
|
|
+
|
|
case MSR_INTEL_MISC_FEATURES_ENABLES:
|
|
{
|
|
bool old_cpuid_faulting = vp->misc_features_enables.cpuid_faulting;
|
|
diff --git a/xen/arch/x86/oprofile/nmi_int.c b/xen/arch/x86/oprofile/nmi_int.c
|
|
index d8f5230906..3dfb8fef93 100644
|
|
--- a/xen/arch/x86/oprofile/nmi_int.c
|
|
+++ b/xen/arch/x86/oprofile/nmi_int.c
|
|
@@ -182,7 +182,7 @@ int nmi_reserve_counters(void)
|
|
if (!allocate_msrs())
|
|
return -ENOMEM;
|
|
|
|
- /* We walk a thin line between law and rape here.
|
|
+ /*
|
|
* We need to be careful to install our NMI handler
|
|
* without actually triggering any NMIs as this will
|
|
* break the core code horrifically.
|
|
diff --git a/xen/arch/x86/percpu.c b/xen/arch/x86/percpu.c
|
|
index c9997b7937..8be4ebddf4 100644
|
|
--- a/xen/arch/x86/percpu.c
|
|
+++ b/xen/arch/x86/percpu.c
|
|
@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int cpu)
|
|
char *p;
|
|
|
|
if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA )
|
|
- return -EBUSY;
|
|
+ return 0;
|
|
|
|
if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL )
|
|
return -ENOMEM;
|
|
@@ -76,9 +76,12 @@ static int cpu_percpu_callback(
|
|
break;
|
|
case CPU_UP_CANCELED:
|
|
case CPU_DEAD:
|
|
- free_percpu_area(cpu);
|
|
+ if ( !park_offline_cpus )
|
|
+ free_percpu_area(cpu);
|
|
break;
|
|
- default:
|
|
+ case CPU_REMOVE:
|
|
+ if ( park_offline_cpus )
|
|
+ free_percpu_area(cpu);
|
|
break;
|
|
}
|
|
|
|
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
|
|
index a4f0bd239d..3230ac6a22 100644
|
|
--- a/xen/arch/x86/pv/domain.c
|
|
+++ b/xen/arch/x86/pv/domain.c
|
|
@@ -13,6 +13,7 @@
|
|
#include <asm/invpcid.h>
|
|
#include <asm/spec_ctrl.h>
|
|
#include <asm/pv/domain.h>
|
|
+#include <asm/shadow.h>
|
|
|
|
static __read_mostly enum {
|
|
PCID_OFF,
|
|
@@ -209,6 +210,8 @@ int pv_vcpu_initialise(struct vcpu *v)
|
|
|
|
void pv_domain_destroy(struct domain *d)
|
|
{
|
|
+ pv_l1tf_domain_destroy(d);
|
|
+
|
|
destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
|
|
GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
|
|
|
|
@@ -229,6 +232,8 @@ int pv_domain_initialise(struct domain *d)
|
|
};
|
|
int rc = -ENOMEM;
|
|
|
|
+ pv_l1tf_domain_init(d);
|
|
+
|
|
d->arch.pv_domain.gdt_ldt_l1tab =
|
|
alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
|
|
if ( !d->arch.pv_domain.gdt_ldt_l1tab )
|
|
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
|
|
index aa8d5a7556..a3c0c2dd19 100644
|
|
--- a/xen/arch/x86/pv/ro-page-fault.c
|
|
+++ b/xen/arch/x86/pv/ro-page-fault.c
|
|
@@ -29,6 +29,7 @@
|
|
#include <asm/mm.h>
|
|
#include <asm/pci.h>
|
|
#include <asm/pv/mm.h>
|
|
+#include <asm/shadow.h>
|
|
|
|
#include "emulate.h"
|
|
#include "mm.h"
|
|
@@ -129,6 +130,10 @@ static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
|
|
|
|
/* Check the new PTE. */
|
|
nl1e = l1e_from_intpte(val);
|
|
+
|
|
+ if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) )
|
|
+ return X86EMUL_RETRY;
|
|
+
|
|
switch ( ret = get_page_from_l1e(nl1e, d, d) )
|
|
{
|
|
default:
|
|
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
|
|
index a3172ca92c..3cd3e81b30 100644
|
|
--- a/xen/arch/x86/setup.c
|
|
+++ b/xen/arch/x86/setup.c
|
|
@@ -62,6 +62,9 @@ boolean_param("nosmp", opt_nosmp);
|
|
static unsigned int __initdata max_cpus;
|
|
integer_param("maxcpus", max_cpus);
|
|
|
|
+int8_t __read_mostly opt_smt = -1;
|
|
+boolean_param("smt", opt_smt);
|
|
+
|
|
/* opt_invpcid: If false, don't use INVPCID instruction even if available. */
|
|
static bool __initdata opt_invpcid = true;
|
|
boolean_param("invpcid", opt_invpcid);
|
|
@@ -665,7 +668,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
|
|
{
|
|
char *memmap_type = NULL;
|
|
char *cmdline, *kextra, *loader;
|
|
- unsigned int initrdidx;
|
|
+ unsigned int initrdidx, num_parked = 0;
|
|
multiboot_info_t *mbi;
|
|
module_t *mod;
|
|
unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
|
|
@@ -909,6 +912,18 @@ void __init noreturn __start_xen(unsigned long mbi_p)
|
|
/* Sanitise the raw E820 map to produce a final clean version. */
|
|
max_page = raw_max_page = init_e820(memmap_type, &e820_raw);
|
|
|
|
+ if ( !efi_enabled(EFI_BOOT) )
|
|
+ {
|
|
+ /*
|
|
+ * Supplement the heuristics in l1tf_calculations() by assuming that
|
|
+ * anything referenced in the E820 may be cacheable.
|
|
+ */
|
|
+ l1tf_safe_maddr =
|
|
+ max(l1tf_safe_maddr,
|
|
+ ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
|
|
+ e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
|
|
+ }
|
|
+
|
|
/* Create a temporary copy of the E820 map. */
|
|
memcpy(&boot_e820, &e820, sizeof(e820));
|
|
|
|
@@ -1494,7 +1509,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
|
|
else
|
|
{
|
|
set_nr_cpu_ids(max_cpus);
|
|
- max_cpus = nr_cpu_ids;
|
|
+ if ( !max_cpus )
|
|
+ max_cpus = nr_cpu_ids;
|
|
}
|
|
|
|
if ( xen_guest )
|
|
@@ -1617,16 +1633,30 @@ void __init noreturn __start_xen(unsigned long mbi_p)
|
|
/* Set up node_to_cpumask based on cpu_to_node[]. */
|
|
numa_add_cpu(i);
|
|
|
|
- if ( (num_online_cpus() < max_cpus) && !cpu_online(i) )
|
|
+ if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
|
|
+ !cpu_online(i) )
|
|
{
|
|
int ret = cpu_up(i);
|
|
if ( ret != 0 )
|
|
printk("Failed to bring up CPU %u (error %d)\n", i, ret);
|
|
+ else if ( num_online_cpus() > max_cpus ||
|
|
+ (!opt_smt &&
|
|
+ cpu_data[i].compute_unit_id == INVALID_CUID &&
|
|
+ cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) )
|
|
+ {
|
|
+ ret = cpu_down(i);
|
|
+ if ( !ret )
|
|
+ ++num_parked;
|
|
+ else
|
|
+ printk("Could not re-offline CPU%u (%d)\n", i, ret);
|
|
+ }
|
|
}
|
|
}
|
|
}
|
|
|
|
printk("Brought up %ld CPUs\n", (long)num_online_cpus());
|
|
+ if ( num_parked )
|
|
+ printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
|
|
smp_cpus_done();
|
|
|
|
do_initcalls();
|
|
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
|
|
index d4478e6132..7e76cc3d68 100644
|
|
--- a/xen/arch/x86/smpboot.c
|
|
+++ b/xen/arch/x86/smpboot.c
|
|
@@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
|
|
cpumask_t cpu_online_map __read_mostly;
|
|
EXPORT_SYMBOL(cpu_online_map);
|
|
|
|
+bool __read_mostly park_offline_cpus;
|
|
+
|
|
unsigned int __read_mostly nr_sockets;
|
|
cpumask_t **__read_mostly socket_cpumask;
|
|
static cpumask_t *secondary_socket_cpumask;
|
|
@@ -234,33 +236,41 @@ static void link_thread_siblings(int cpu1, int cpu2)
|
|
cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
|
|
}
|
|
|
|
-static void set_cpu_sibling_map(int cpu)
|
|
+static void set_cpu_sibling_map(unsigned int cpu)
|
|
{
|
|
- int i;
|
|
+ unsigned int i;
|
|
struct cpuinfo_x86 *c = cpu_data;
|
|
|
|
cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
|
|
|
|
cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
|
|
+ cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu));
|
|
+ cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
|
|
|
|
if ( c[cpu].x86_num_siblings > 1 )
|
|
{
|
|
for_each_cpu ( i, &cpu_sibling_setup_map )
|
|
{
|
|
- if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) {
|
|
- if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
|
|
- (c[cpu].compute_unit_id == c[i].compute_unit_id) )
|
|
+ if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id )
|
|
+ continue;
|
|
+ if ( c[cpu].compute_unit_id != INVALID_CUID &&
|
|
+ c[i].compute_unit_id != INVALID_CUID )
|
|
+ {
|
|
+ if ( c[cpu].compute_unit_id == c[i].compute_unit_id )
|
|
link_thread_siblings(cpu, i);
|
|
- } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
|
|
- (c[cpu].cpu_core_id == c[i].cpu_core_id) ) {
|
|
- link_thread_siblings(cpu, i);
|
|
}
|
|
+ else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID &&
|
|
+ c[i].cpu_core_id != XEN_INVALID_CORE_ID )
|
|
+ {
|
|
+ if ( c[cpu].cpu_core_id == c[i].cpu_core_id )
|
|
+ link_thread_siblings(cpu, i);
|
|
+ }
|
|
+ else
|
|
+ printk(XENLOG_WARNING
|
|
+ "CPU%u: unclear relationship with CPU%u\n",
|
|
+ cpu, i);
|
|
}
|
|
}
|
|
- else
|
|
- {
|
|
- cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
|
|
- }
|
|
|
|
if ( c[cpu].x86_max_cores == 1 )
|
|
{
|
|
@@ -887,7 +897,14 @@ static void cleanup_cpu_root_pgt(unsigned int cpu)
|
|
}
|
|
}
|
|
|
|
-static void cpu_smpboot_free(unsigned int cpu)
|
|
+/*
|
|
+ * The 'remove' boolean controls whether a CPU is just getting offlined (and
|
|
+ * parked), or outright removed / offlined without parking. Parked CPUs need
|
|
+ * things like their stack, GDT, IDT, TSS, and per-CPU data still available.
|
|
+ * A few other items, in particular CPU masks, are also retained, as it's
|
|
+ * difficult to prove that they're entirely unreferenced from parked CPUs.
|
|
+ */
|
|
+static void cpu_smpboot_free(unsigned int cpu, bool remove)
|
|
{
|
|
unsigned int order, socket = cpu_to_socket(cpu);
|
|
struct cpuinfo_x86 *c = cpu_data;
|
|
@@ -898,15 +915,19 @@ static void cpu_smpboot_free(unsigned int cpu)
|
|
socket_cpumask[socket] = NULL;
|
|
}
|
|
|
|
- c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
|
|
- c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
|
|
- c[cpu].compute_unit_id = INVALID_CUID;
|
|
cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
|
|
|
|
- free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
|
|
- free_cpumask_var(per_cpu(cpu_core_mask, cpu));
|
|
- if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
|
|
- free_cpumask_var(per_cpu(scratch_cpumask, cpu));
|
|
+ if ( remove )
|
|
+ {
|
|
+ c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
|
|
+ c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
|
|
+ c[cpu].compute_unit_id = INVALID_CUID;
|
|
+
|
|
+ FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu));
|
|
+ FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
|
|
+ if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
|
|
+ FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
|
|
+ }
|
|
|
|
cleanup_cpu_root_pgt(cpu);
|
|
|
|
@@ -928,19 +949,21 @@ static void cpu_smpboot_free(unsigned int cpu)
|
|
}
|
|
|
|
order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
|
|
- free_xenheap_pages(per_cpu(gdt_table, cpu), order);
|
|
+ if ( remove )
|
|
+ FREE_XENHEAP_PAGES(per_cpu(gdt_table, cpu), order);
|
|
|
|
free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);
|
|
|
|
- order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
|
|
- free_xenheap_pages(idt_tables[cpu], order);
|
|
- idt_tables[cpu] = NULL;
|
|
-
|
|
- if ( stack_base[cpu] != NULL )
|
|
+ if ( remove )
|
|
{
|
|
- memguard_unguard_stack(stack_base[cpu]);
|
|
- free_xenheap_pages(stack_base[cpu], STACK_ORDER);
|
|
- stack_base[cpu] = NULL;
|
|
+ order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
|
|
+ FREE_XENHEAP_PAGES(idt_tables[cpu], order);
|
|
+
|
|
+ if ( stack_base[cpu] )
|
|
+ {
|
|
+ memguard_unguard_stack(stack_base[cpu]);
|
|
+ FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER);
|
|
+ }
|
|
}
|
|
}
|
|
|
|
@@ -955,15 +978,17 @@ static int cpu_smpboot_alloc(unsigned int cpu)
|
|
if ( node != NUMA_NO_NODE )
|
|
memflags = MEMF_node(node);
|
|
|
|
- stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
|
|
+ if ( stack_base[cpu] == NULL )
|
|
+ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
|
|
if ( stack_base[cpu] == NULL )
|
|
goto out;
|
|
memguard_guard_stack(stack_base[cpu]);
|
|
|
|
order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
|
|
- per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
|
|
+ gdt = per_cpu(gdt_table, cpu) ?: alloc_xenheap_pages(order, memflags);
|
|
if ( gdt == NULL )
|
|
goto out;
|
|
+ per_cpu(gdt_table, cpu) = gdt;
|
|
memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
|
|
BUILD_BUG_ON(NR_CPUS > 0x10000);
|
|
gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
|
|
@@ -975,7 +1000,8 @@ static int cpu_smpboot_alloc(unsigned int cpu)
|
|
gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
|
|
|
|
order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
|
|
- idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
|
|
+ if ( idt_tables[cpu] == NULL )
|
|
+ idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
|
|
if ( idt_tables[cpu] == NULL )
|
|
goto out;
|
|
memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
|
|
@@ -1003,16 +1029,16 @@ static int cpu_smpboot_alloc(unsigned int cpu)
|
|
(secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
|
|
goto out;
|
|
|
|
- if ( !(zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
|
|
- zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
|
|
- alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
|
|
+ if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
|
|
+ cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
|
|
+ cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
|
|
goto out;
|
|
|
|
rc = 0;
|
|
|
|
out:
|
|
if ( rc )
|
|
- cpu_smpboot_free(cpu);
|
|
+ cpu_smpboot_free(cpu, true);
|
|
|
|
return rc;
|
|
}
|
|
@@ -1030,9 +1056,10 @@ static int cpu_smpboot_callback(
|
|
break;
|
|
case CPU_UP_CANCELED:
|
|
case CPU_DEAD:
|
|
- cpu_smpboot_free(cpu);
|
|
+ cpu_smpboot_free(cpu, !park_offline_cpus);
|
|
break;
|
|
- default:
|
|
+ case CPU_REMOVE:
|
|
+ cpu_smpboot_free(cpu, true);
|
|
break;
|
|
}
|
|
|
|
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
|
|
index 08e6784c4c..f0c50d6703 100644
|
|
--- a/xen/arch/x86/spec_ctrl.c
|
|
+++ b/xen/arch/x86/spec_ctrl.c
|
|
@@ -19,10 +19,13 @@
|
|
#include <xen/errno.h>
|
|
#include <xen/init.h>
|
|
#include <xen/lib.h>
|
|
+#include <xen/warning.h>
|
|
|
|
#include <asm/microcode.h>
|
|
#include <asm/msr.h>
|
|
#include <asm/processor.h>
|
|
+#include <asm/pv/shim.h>
|
|
+#include <asm/setup.h>
|
|
#include <asm/spec_ctrl.h>
|
|
#include <asm/spec_ctrl_asm.h>
|
|
|
|
@@ -45,11 +48,16 @@ static int8_t __initdata opt_ibrs = -1;
|
|
bool __read_mostly opt_ibpb = true;
|
|
bool __read_mostly opt_ssbd = false;
|
|
int8_t __read_mostly opt_eager_fpu = -1;
|
|
+int8_t __read_mostly opt_l1d_flush = -1;
|
|
|
|
bool __initdata bsp_delay_spec_ctrl;
|
|
uint8_t __read_mostly default_xen_spec_ctrl;
|
|
uint8_t __read_mostly default_spec_ctrl_flags;
|
|
|
|
+paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr;
|
|
+static bool __initdata cpu_has_bug_l1tf;
|
|
+static unsigned int __initdata l1d_maxphysaddr;
|
|
+
|
|
static int __init parse_bti(const char *s)
|
|
{
|
|
const char *ss;
|
|
@@ -124,6 +132,17 @@ static int __init parse_spec_ctrl(const char *s)
|
|
opt_msr_sc_pv = false;
|
|
opt_msr_sc_hvm = false;
|
|
|
|
+ opt_eager_fpu = 0;
|
|
+
|
|
+ if ( opt_xpti < 0 )
|
|
+ opt_xpti = 0;
|
|
+
|
|
+ if ( opt_smt < 0 )
|
|
+ opt_smt = 1;
|
|
+
|
|
+ if ( opt_pv_l1tf < 0 )
|
|
+ opt_pv_l1tf = 0;
|
|
+
|
|
disable_common:
|
|
opt_rsb_pv = false;
|
|
opt_rsb_hvm = false;
|
|
@@ -131,7 +150,8 @@ static int __init parse_spec_ctrl(const char *s)
|
|
opt_thunk = THUNK_JMP;
|
|
opt_ibrs = 0;
|
|
opt_ibpb = false;
|
|
- opt_eager_fpu = 0;
|
|
+ opt_ssbd = false;
|
|
+ opt_l1d_flush = 0;
|
|
}
|
|
else if ( val > 0 )
|
|
rc = -EINVAL;
|
|
@@ -187,6 +207,8 @@ static int __init parse_spec_ctrl(const char *s)
|
|
opt_ssbd = val;
|
|
else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
|
|
opt_eager_fpu = val;
|
|
+ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
|
|
+ opt_l1d_flush = val;
|
|
else
|
|
rc = -EINVAL;
|
|
|
|
@@ -197,6 +219,55 @@ static int __init parse_spec_ctrl(const char *s)
|
|
}
|
|
custom_param("spec-ctrl", parse_spec_ctrl);
|
|
|
|
+int8_t __read_mostly opt_pv_l1tf = -1;
|
|
+/* Parse "pv-l1tf": a comma-separated list of <bool>, dom0=<bool>, domu=<bool>. Returns 0, or -EINVAL if any element is unrecognised. */
|
|
+static __init int parse_pv_l1tf(const char *s)
|
|
+{
|
|
+ const char *ss;
|
|
+ int val, rc = 0;
|
|
+
|
|
+ /* Inhibit the defaults as an explicit choice has been given. */
|
|
+ if ( opt_pv_l1tf == -1 )
|
|
+ opt_pv_l1tf = 0;
|
|
+
|
|
+ /* Interpret 'pv-l1tf' alone in its positive boolean form. */
|
|
+ if ( *s == '\0' )
|
|
+ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
|
|
+
|
|
+ do {
|
|
+ ss = strchr(s, ',');
|
|
+ if ( !ss )
|
|
+ ss = strchr(s, '\0');
|
|
+
|
|
+ switch ( parse_bool(s, ss) )
|
|
+ {
|
|
+ case 0:
|
|
+ opt_pv_l1tf = 0;
|
|
+ break;
|
|
+
|
|
+ case 1:
|
|
+ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
|
|
+ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOM0) |
|
|
+ (val ? OPT_PV_L1TF_DOM0 : 0));
|
|
+ else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
|
|
+ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) |
|
|
+ (val ? OPT_PV_L1TF_DOMU : 0));
|
|
+ else
|
|
+ rc = -EINVAL;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ s = ss + 1;
|
|
+ } while ( *ss );
|
|
+
|
|
+ return rc;
|
|
+}
|
|
+custom_param("pv-l1tf", parse_pv_l1tf);
|
|
+
|
|
static void __init print_details(enum ind_thunk thunk, uint64_t caps)
|
|
{
|
|
unsigned int _7d0 = 0, e8b = 0, tmp;
|
|
@@ -210,22 +281,31 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
|
|
printk("Speculative mitigation facilities:\n");
|
|
|
|
/* Hardware features which pertain to speculative mitigations. */
|
|
- printk(" Hardware features:%s%s%s%s%s%s%s%s\n",
|
|
+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n",
|
|
(_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
|
|
(_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
|
|
+ (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
|
|
(_7d0 & cpufeat_mask(X86_FEATURE_SSBD)) ? " SSBD" : "",
|
|
(e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "",
|
|
(caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "",
|
|
(caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "",
|
|
(caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
|
|
+ (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "",
|
|
(caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "");
|
|
|
|
- /* Compiled-in support which pertains to BTI mitigations. */
|
|
- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
|
|
- printk(" Compiled-in support: INDIRECT_THUNK\n");
|
|
+ /* Compiled-in support which pertains to mitigations. */
|
|
+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
|
|
+ printk(" Compiled-in support:"
|
|
+#ifdef CONFIG_INDIRECT_THUNK
|
|
+ " INDIRECT_THUNK"
|
|
+#endif
|
|
+#ifdef CONFIG_SHADOW_PAGING
|
|
+ " SHADOW_PAGING"
|
|
+#endif
|
|
+ "\n");
|
|
|
|
/* Settings for Xen's protection, irrespective of guests. */
|
|
- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s\n",
|
|
+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s\n",
|
|
thunk == THUNK_NONE ? "N/A" :
|
|
thunk == THUNK_RETPOLINE ? "RETPOLINE" :
|
|
thunk == THUNK_LFENCE ? "LFENCE" :
|
|
@@ -234,7 +314,15 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
|
|
(default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
|
|
!boot_cpu_has(X86_FEATURE_SSBD) ? "" :
|
|
(default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
|
|
- opt_ibpb ? " IBPB" : "");
|
|
+ opt_ibpb ? " IBPB" : "",
|
|
+ opt_l1d_flush ? " L1D_FLUSH" : "");
|
|
+
|
|
+ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
|
|
+ if ( cpu_has_bug_l1tf || opt_pv_l1tf )
|
|
+ printk(" L1TF: believed%s vulnerable, maxphysaddr L1D %u, CPUID %u"
|
|
+ ", Safe address %"PRIx64"\n",
|
|
+ cpu_has_bug_l1tf ? "" : " not",
|
|
+ l1d_maxphysaddr, paddr_bits, l1tf_safe_maddr);
|
|
|
|
/*
|
|
* Alternatives blocks for protecting against and/or virtualising
|
|
@@ -257,6 +345,10 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
|
|
printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s\n",
|
|
opt_xpti & OPT_XPTI_DOM0 ? "enabled" : "disabled",
|
|
opt_xpti & OPT_XPTI_DOMU ? "enabled" : "disabled");
|
|
+
|
|
+ printk(" PV L1TF shadowing: Dom0 %s, DomU %s\n",
|
|
+ opt_pv_l1tf & OPT_PV_L1TF_DOM0 ? "enabled" : "disabled",
|
|
+ opt_pv_l1tf & OPT_PV_L1TF_DOMU ? "enabled" : "disabled");
|
|
}
|
|
|
|
/* Calculate whether Retpoline is known-safe on this CPU. */
|
|
@@ -418,20 +510,159 @@ static bool __init should_use_eager_fpu(void)
|
|
}
|
|
}
|
|
|
|
-#define OPT_XPTI_DEFAULT 0xff
|
|
-uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT;
|
|
-
|
|
-static __init void xpti_init_default(bool force)
|
|
+/* Calculate whether this CPU is vulnerable to L1TF. */
|
|
+static __init void l1tf_calculations(uint64_t caps)
|
|
{
|
|
- uint64_t caps = 0;
|
|
+ bool hit_default = false;
|
|
+
|
|
+ l1d_maxphysaddr = paddr_bits;
|
|
+
|
|
+ /* L1TF is only known to affect Intel Family 6 processors at this time. */
|
|
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
|
|
+ boot_cpu_data.x86 == 6 )
|
|
+ {
|
|
+ switch ( boot_cpu_data.x86_model )
|
|
+ {
|
|
+ /*
|
|
+ * Core processors since at least Penryn are vulnerable.
|
|
+ */
|
|
+ case 0x17: /* Penryn */
|
|
+ case 0x1d: /* Dunnington */
|
|
+ cpu_has_bug_l1tf = true;
|
|
+ break;
|
|
+
|
|
+ case 0x1f: /* Auburndale / Havendale */
|
|
+ case 0x1e: /* Nehalem */
|
|
+ case 0x1a: /* Nehalem EP */
|
|
+ case 0x2e: /* Nehalem EX */
|
|
+ case 0x25: /* Westmere */
|
|
+ case 0x2c: /* Westmere EP */
|
|
+ case 0x2f: /* Westmere EX */
|
|
+ cpu_has_bug_l1tf = true;
|
|
+ l1d_maxphysaddr = 44;
|
|
+ break;
|
|
+
|
|
+ case 0x2a: /* SandyBridge */
|
|
+ case 0x2d: /* SandyBridge EP/EX */
|
|
+ case 0x3a: /* IvyBridge */
|
|
+ case 0x3e: /* IvyBridge EP/EX */
|
|
+ case 0x3c: /* Haswell */
|
|
+ case 0x3f: /* Haswell EX/EP */
|
|
+ case 0x45: /* Haswell D */
|
|
+ case 0x46: /* Haswell H */
|
|
+ case 0x3d: /* Broadwell */
|
|
+ case 0x47: /* Broadwell H */
|
|
+ case 0x4f: /* Broadwell EP/EX */
|
|
+ case 0x56: /* Broadwell D */
|
|
+ case 0x4e: /* Skylake M */
|
|
+ case 0x55: /* Skylake X */
|
|
+ case 0x5e: /* Skylake D */
|
|
+ case 0x66: /* Cannonlake */
|
|
+ case 0x67: /* Cannonlake? */
|
|
+ case 0x8e: /* Kabylake M */
|
|
+ case 0x9e: /* Kabylake D */
|
|
+ cpu_has_bug_l1tf = true;
|
|
+ l1d_maxphysaddr = 46;
|
|
+ break;
|
|
+
|
|
+ /*
|
|
+ * Atom processors are not vulnerable.
|
|
+ */
|
|
+ case 0x1c: /* Pineview */
|
|
+ case 0x26: /* Lincroft */
|
|
+ case 0x27: /* Penwell */
|
|
+ case 0x35: /* Cloverview */
|
|
+ case 0x36: /* Cedarview */
|
|
+ case 0x37: /* Baytrail / Valleyview (Silvermont) */
|
|
+ case 0x4d: /* Avoton / Rangely (Silvermont) */
|
|
+ case 0x4c: /* Cherrytrail / Braswell */
|
|
+ case 0x4a: /* Merrifield */
|
|
+ case 0x5a: /* Moorefield */
|
|
+ case 0x5c: /* Goldmont */
|
|
+ case 0x5f: /* Denverton */
|
|
+ case 0x7a: /* Gemini Lake */
|
|
+ break;
|
|
+
|
|
+ /*
|
|
+ * Knights processors are not vulnerable.
|
|
+ */
|
|
+ case 0x57: /* Knights Landing */
|
|
+ case 0x85: /* Knights Mill */
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ /* Defer printk() until we've accounted for RDCL_NO. */
|
|
+ hit_default = true;
|
|
+ cpu_has_bug_l1tf = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Any processor advertising RDCL_NO should not be vulnerable to L1TF. */
|
|
+ if ( caps & ARCH_CAPABILITIES_RDCL_NO )
|
|
+ cpu_has_bug_l1tf = false;
|
|
+
|
|
+ if ( cpu_has_bug_l1tf && hit_default )
|
|
+ printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n",
|
|
+ boot_cpu_data.x86_model);
|
|
+
|
|
+ /*
|
|
+ * L1TF safe address heuristics. These apply to the real hardware we are
|
|
+ * running on, and are best-effort-only if Xen is virtualised.
|
|
+ *
|
|
+ * The address mask which the L1D cache uses, which might be wider than
|
|
+ * the CPUID-reported maxphysaddr.
|
|
+ */
|
|
+ l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK;
|
|
+
|
|
+ /*
|
|
+ * To be safe, l1tf_safe_maddr must be above the highest cacheable entity
|
|
+ * in system physical address space. However, to preserve space for
|
|
+ * paged-out metadata, it should be as low as possible above the highest
|
|
+ * cacheable address, so as to require fewer high-order bits being set.
|
|
+ *
|
|
+ * These heuristics are based on some guesswork to improve the likelihood
|
|
+ * of safety in the common case, including Linux's L1TF mitigation of
|
|
+ * inverting all address bits in a non-present PTE.
|
|
+ *
|
|
+ * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end
|
|
+ * server), setting any address bit beyond CPUID maxphysaddr guarantees
|
|
+ * to make the PTE safe. This case doesn't require all the high-order
|
|
+ * bits being set, and doesn't require any other source of information
|
|
+ * for safety.
|
|
+ *
|
|
+ * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we
|
|
+ * must sacrifice high order bits from the real address space for
|
|
+ * safety. Therefore, make a blind guess that there is nothing
|
|
+ * cacheable in the top quarter of physical address space.
|
|
+ *
|
|
+ * It is exceedingly unlikely for machines to be populated with this
|
|
+ * much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on
|
|
+ * Sandybridge and later) due to the sheer volume of DIMMs this would
|
|
+ * actually take.
|
|
+ *
|
|
+ * However, it is possible to find machines this large, so the "top
|
|
+ * quarter" guess is supplemented to push the limit higher if references
|
|
+ * to cacheable mappings (E820/SRAT/EFI/etc) are found above the top
|
|
+ * quarter boundary.
|
|
+ *
|
|
+ * Finally, this top quarter guess gives us a good chance of being safe
|
|
+ * when running virtualised (and the CPUID maxphysaddr hasn't been
|
|
+ * levelled for heterogeneous migration safety), where the safety
|
|
+ * consideration is still in terms of host details, but all E820/etc
|
|
+ * information is in terms of guest physical layout.
|
|
+ */
|
|
+ l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits)
|
|
+ ? (1ul << paddr_bits)
|
|
+ : (3ul << (paddr_bits - 2))));
|
|
+}
|
|
|
|
- if ( !force && (opt_xpti != OPT_XPTI_DEFAULT) )
|
|
- return;
|
|
+int8_t __read_mostly opt_xpti = -1;
|
|
|
|
+static __init void xpti_init_default(uint64_t caps)
|
|
+{
|
|
if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
|
|
caps = ARCH_CAPABILITIES_RDCL_NO;
|
|
- else if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
|
|
- rdmsrl(MSR_ARCH_CAPABILITIES, caps);
|
|
|
|
if ( caps & ARCH_CAPABILITIES_RDCL_NO )
|
|
opt_xpti = 0;
|
|
@@ -444,7 +675,13 @@ static __init int parse_xpti(const char *s)
|
|
const char *ss;
|
|
int val, rc = 0;
|
|
|
|
- xpti_init_default(false);
|
|
+ /* Inhibit the defaults as an explicit choice has been given. */
|
|
+ if ( opt_xpti == -1 )
|
|
+ opt_xpti = 0;
|
|
+
|
|
+ /* Interpret 'xpti' alone in its positive boolean form. */
|
|
+ if ( *s == '\0' )
|
|
+ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
|
|
|
|
do {
|
|
ss = strchr(s, ',');
|
|
@@ -463,7 +700,7 @@ static __init int parse_xpti(const char *s)
|
|
|
|
default:
|
|
if ( !strcmp(s, "default") )
|
|
- xpti_init_default(true);
|
|
+ opt_xpti = -1;
|
|
else if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
|
|
opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) |
|
|
(val ? OPT_XPTI_DOM0 : 0);
|
|
@@ -625,12 +862,58 @@ void __init init_speculation_mitigations(void)
|
|
if ( default_xen_spec_ctrl )
|
|
setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
|
|
|
|
- xpti_init_default(false);
|
|
+ if ( opt_xpti == -1 )
|
|
+ xpti_init_default(caps);
|
|
+
|
|
if ( opt_xpti == 0 )
|
|
setup_force_cpu_cap(X86_FEATURE_NO_XPTI);
|
|
else
|
|
setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
|
|
|
|
+ l1tf_calculations(caps);
|
|
+
|
|
+ /*
|
|
+ * By default, enable PV domU L1TF mitigations on all L1TF-vulnerable
|
|
+ * hardware, except when running in shim mode.
|
|
+ *
|
|
+ * In shim mode, SHADOW is expected to be compiled out, and a malicious
|
|
+ * guest kernel can only attack the shim Xen, not the host Xen.
|
|
+ */
|
|
+ if ( opt_pv_l1tf == -1 )
|
|
+ {
|
|
+ if ( pv_shim || !cpu_has_bug_l1tf )
|
|
+ opt_pv_l1tf = 0;
|
|
+ else
|
|
+ opt_pv_l1tf = OPT_PV_L1TF_DOMU;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * By default, enable L1D_FLUSH on L1TF-vulnerable hardware, unless
|
|
+ * instructed to skip the flush on vmentry by our outer hypervisor.
|
|
+ */
|
|
+ if ( !boot_cpu_has(X86_FEATURE_L1D_FLUSH) )
|
|
+ opt_l1d_flush = 0;
|
|
+ else if ( opt_l1d_flush == -1 )
|
|
+ opt_l1d_flush = cpu_has_bug_l1tf && !(caps & ARCH_CAPS_SKIP_L1DFL);
|
|
+
|
|
+ /*
|
|
+ * We do not disable HT by default on affected hardware.
|
|
+ *
|
|
+ * Firstly, if the user intends to use exclusively PV, or HVM shadow
|
|
+ * guests, HT isn't a concern and should remain fully enabled. Secondly,
|
|
+ * safety for HVM HAP guests can be arranged by the toolstack with core
|
|
+ * parking, pinning or cpupool configurations, including mixed setups.
|
|
+ *
|
|
+ * However, if we are on affected hardware, with HT enabled, and the user
|
|
+ * hasn't explicitly chosen whether to use HT or not, nag them to do so.
|
|
+ */
|
|
+ if ( opt_smt == -1 && cpu_has_bug_l1tf && !pv_shim &&
|
|
+ boot_cpu_data.x86_num_siblings > 1 )
|
|
+ warning_add(
|
|
+ "Booted on L1TF-vulnerable hardware with SMT/Hyperthreading\n"
|
|
+ "enabled. Please assess your configuration and choose an\n"
|
|
+ "explicit 'smt=<bool>' setting. See XSA-273.\n");
|
|
+
|
|
print_details(thunk, caps);
|
|
|
|
/*
|
|
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
|
|
index 166eb44fe2..2d70b45909 100644
|
|
--- a/xen/arch/x86/srat.c
|
|
+++ b/xen/arch/x86/srat.c
|
|
@@ -20,6 +20,7 @@
|
|
#include <xen/pfn.h>
|
|
#include <asm/e820.h>
|
|
#include <asm/page.h>
|
|
+#include <asm/spec_ctrl.h>
|
|
|
|
static struct acpi_table_slit *__read_mostly acpi_slit;
|
|
|
|
@@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
|
|
if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
|
|
return;
|
|
|
|
+ start = ma->base_address;
|
|
+ end = start + ma->length;
|
|
+ /* Supplement the heuristics in l1tf_calculations(). */
|
|
+ l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));
|
|
+
|
|
if (num_node_memblks >= NR_NODE_MEMBLKS)
|
|
{
|
|
dprintk(XENLOG_WARNING,
|
|
@@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
|
|
return;
|
|
}
|
|
|
|
- start = ma->base_address;
|
|
- end = start + ma->length;
|
|
pxm = ma->proximity_domain;
|
|
if (srat_rev < 2)
|
|
pxm &= 0xff;
|
|
diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
|
|
index 4d372db12b..e704ed7f1c 100644
|
|
--- a/xen/arch/x86/sysctl.c
|
|
+++ b/xen/arch/x86/sysctl.c
|
|
@@ -23,6 +23,7 @@
|
|
#include <asm/hvm/hvm.h>
|
|
#include <asm/hvm/support.h>
|
|
#include <asm/processor.h>
|
|
+#include <asm/setup.h>
|
|
#include <asm/smp.h>
|
|
#include <asm/numa.h>
|
|
#include <xen/nodemask.h>
|
|
@@ -48,14 +49,27 @@ static void l3_cache_get(void *arg)
|
|
|
|
long cpu_up_helper(void *data)
|
|
{
|
|
- int cpu = (unsigned long)data;
|
|
+ unsigned int cpu = (unsigned long)data;
|
|
int ret = cpu_up(cpu);
|
|
+
|
|
if ( ret == -EBUSY )
|
|
{
|
|
/* On EBUSY, flush RCU work and have one more go. */
|
|
rcu_barrier();
|
|
ret = cpu_up(cpu);
|
|
}
|
|
+
|
|
+ if ( !ret && !opt_smt &&
|
|
+ cpu_data[cpu].compute_unit_id == INVALID_CUID &&
|
|
+ cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) > 1 )
|
|
+ {
|
|
+ ret = cpu_down_helper(data);
|
|
+ if ( ret )
|
|
+ printk("Could not re-offline CPU%u (%d)\n", cpu, ret);
|
|
+ else
|
|
+ ret = -EPERM;
|
|
+ }
|
|
+
|
|
return ret;
|
|
}
|
|
|
|
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
|
|
index 9f045a2045..789d7ff8cd 100644
|
|
--- a/xen/arch/x86/traps.c
|
|
+++ b/xen/arch/x86/traps.c
|
|
@@ -96,8 +96,6 @@ string_param("nmi", opt_nmi);
|
|
DEFINE_PER_CPU(uint64_t, efer);
|
|
static DEFINE_PER_CPU(unsigned long, last_extable_addr);
|
|
|
|
-DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
|
|
-
|
|
DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
|
|
DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
|
|
|
|
@@ -117,6 +115,9 @@ integer_param("debug_stack_lines", debug_stack_lines);
|
|
static bool opt_ler;
|
|
boolean_param("ler", opt_ler);
|
|
|
|
+/* LastExceptionFromIP on this hardware. Zero if LER is not in use. */
|
|
+unsigned int __read_mostly ler_msr;
|
|
+
|
|
#define stack_words_per_line 4
|
|
#define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
|
|
|
|
@@ -1778,17 +1779,6 @@ void do_device_not_available(struct cpu_user_regs *regs)
|
|
return;
|
|
}
|
|
|
|
-static void ler_enable(void)
|
|
-{
|
|
- u64 debugctl;
|
|
-
|
|
- if ( !this_cpu(ler_msr) )
|
|
- return;
|
|
-
|
|
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
|
|
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR);
|
|
-}
|
|
-
|
|
void do_debug(struct cpu_user_regs *regs)
|
|
{
|
|
unsigned long dr6;
|
|
@@ -1821,6 +1811,10 @@ void do_debug(struct cpu_user_regs *regs)
|
|
*/
|
|
write_debugreg(6, X86_DR6_DEFAULT);
|
|
|
|
+ /* #DB automatically disabled LBR. Reinstate it if debugging Xen. */
|
|
+ if ( cpu_has_xen_lbr )
|
|
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
|
|
+
|
|
if ( !guest_mode(regs) )
|
|
{
|
|
/*
|
|
@@ -1838,7 +1832,7 @@ void do_debug(struct cpu_user_regs *regs)
|
|
{
|
|
if ( regs->rip == (unsigned long)sysenter_eflags_saved )
|
|
regs->eflags &= ~X86_EFLAGS_TF;
|
|
- goto out;
|
|
+ return;
|
|
}
|
|
if ( !debugger_trap_fatal(TRAP_debug, regs) )
|
|
{
|
|
@@ -1895,20 +1889,14 @@ void do_debug(struct cpu_user_regs *regs)
|
|
regs->cs, _p(regs->rip), _p(regs->rip),
|
|
regs->ss, _p(regs->rsp), dr6);
|
|
|
|
- goto out;
|
|
+ return;
|
|
}
|
|
|
|
/* Save debug status register where guest OS can peek at it */
|
|
v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
|
|
v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
|
|
|
|
- ler_enable();
|
|
pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
|
|
- return;
|
|
-
|
|
- out:
|
|
- ler_enable();
|
|
- return;
|
|
}
|
|
|
|
static void __init noinline __set_intr_gate(unsigned int n,
|
|
@@ -1952,38 +1940,46 @@ void load_TR(void)
|
|
: "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
|
|
}
|
|
|
|
-void percpu_traps_init(void)
|
|
+static unsigned int calc_ler_msr(void)
|
|
{
|
|
- subarch_percpu_traps_init();
|
|
-
|
|
- if ( !opt_ler )
|
|
- return;
|
|
-
|
|
switch ( boot_cpu_data.x86_vendor )
|
|
{
|
|
case X86_VENDOR_INTEL:
|
|
switch ( boot_cpu_data.x86 )
|
|
{
|
|
case 6:
|
|
- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
|
|
- break;
|
|
+ return MSR_IA32_LASTINTFROMIP;
|
|
+
|
|
case 15:
|
|
- this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
|
|
- break;
|
|
+ return MSR_P4_LER_FROM_LIP;
|
|
}
|
|
break;
|
|
+
|
|
case X86_VENDOR_AMD:
|
|
switch ( boot_cpu_data.x86 )
|
|
{
|
|
case 6:
|
|
case 0xf ... 0x17:
|
|
- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
|
|
- break;
|
|
+ return MSR_IA32_LASTINTFROMIP;
|
|
}
|
|
break;
|
|
}
|
|
|
|
- ler_enable();
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void percpu_traps_init(void)
|
|
+{
|
|
+ subarch_percpu_traps_init();
|
|
+
|
|
+ if ( !opt_ler )
|
|
+ return;
|
|
+
|
|
+ if ( !ler_msr && (ler_msr = calc_ler_msr()) )
|
|
+ setup_force_cpu_cap(X86_FEATURE_XEN_LBR);
|
|
+
|
|
+ if ( cpu_has_xen_lbr )
|
|
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
|
|
}
|
|
|
|
void __init init_idt_traps(void)
|
|
diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c
|
|
index c2aa6f2fdb..02bc75b91e 100644
|
|
--- a/xen/arch/x86/x86_64/compat/mm.c
|
|
+++ b/xen/arch/x86/x86_64/compat/mm.c
|
|
@@ -163,19 +163,6 @@ int compat_arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
|
|
return rc;
|
|
}
|
|
|
|
-int compat_update_va_mapping(unsigned int va, u32 lo, u32 hi,
|
|
- unsigned int flags)
|
|
-{
|
|
- return do_update_va_mapping(va, lo | ((u64)hi << 32), flags);
|
|
-}
|
|
-
|
|
-int compat_update_va_mapping_otherdomain(unsigned long va, u32 lo, u32 hi,
|
|
- unsigned long flags,
|
|
- domid_t domid)
|
|
-{
|
|
- return do_update_va_mapping_otherdomain(va, lo | ((u64)hi << 32), flags, domid);
|
|
-}
|
|
-
|
|
DEFINE_XEN_GUEST_HANDLE(mmuext_op_compat_t);
|
|
|
|
int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(void) arg,
|
|
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
|
|
index f7f6928d70..b0401850ef 100644
|
|
--- a/xen/arch/x86/x86_64/traps.c
|
|
+++ b/xen/arch/x86/x86_64/traps.c
|
|
@@ -144,11 +144,12 @@ void show_registers(const struct cpu_user_regs *regs)
|
|
printk("CPU: %d\n", smp_processor_id());
|
|
_show_registers(&fault_regs, fault_crs, context, v);
|
|
|
|
- if ( this_cpu(ler_msr) && !guest_mode(regs) )
|
|
+ if ( ler_msr && !guest_mode(regs) )
|
|
{
|
|
u64 from, to;
|
|
- rdmsrl(this_cpu(ler_msr), from);
|
|
- rdmsrl(this_cpu(ler_msr) + 1, to);
|
|
+
|
|
+ rdmsrl(ler_msr, from);
|
|
+ rdmsrl(ler_msr + 1, to);
|
|
printk("ler: %016lx -> %016lx\n", from, to);
|
|
}
|
|
}
|
|
diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
|
|
index b4aea4b50a..15edd5df96 100644
|
|
--- a/xen/arch/x86/xstate.c
|
|
+++ b/xen/arch/x86/xstate.c
|
|
@@ -670,12 +670,17 @@ static bool valid_xcr0(u64 xcr0)
|
|
return !(xcr0 & X86_XCR0_BNDREGS) == !(xcr0 & X86_XCR0_BNDCSR);
|
|
}
|
|
|
|
-int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
|
|
+int validate_xstate(const struct domain *d, uint64_t xcr0, uint64_t xcr0_accum,
|
|
+ const struct xsave_hdr *hdr)
|
|
{
|
|
+ const struct cpuid_policy *cp = d->arch.cpuid;
|
|
+ uint64_t xcr0_max =
|
|
+ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
|
|
unsigned int i;
|
|
|
|
if ( (hdr->xstate_bv & ~xcr0_accum) ||
|
|
(xcr0 & ~xcr0_accum) ||
|
|
+ (xcr0_accum & ~xcr0_max) ||
|
|
!valid_xcr0(xcr0) ||
|
|
!valid_xcr0(xcr0_accum) )
|
|
return -EINVAL;
|
|
@@ -694,20 +699,40 @@ int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
|
|
int handle_xsetbv(u32 index, u64 new_bv)
|
|
{
|
|
struct vcpu *curr = current;
|
|
+ const struct cpuid_policy *cp = curr->domain->arch.cpuid;
|
|
+ uint64_t xcr0_max =
|
|
+ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
|
|
u64 mask;
|
|
|
|
if ( index != XCR_XFEATURE_ENABLED_MASK )
|
|
return -EOPNOTSUPP;
|
|
|
|
- if ( (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
|
|
+ /*
|
|
+ * The CPUID logic shouldn't be able to hand out an XCR0 exceeding Xen's
|
|
+ * maximum features, but keep the check for robustness.
|
|
+ */
|
|
+ if ( unlikely(xcr0_max & ~xfeature_mask) )
|
|
+ {
|
|
+ gprintk(XENLOG_ERR,
|
|
+ "xcr0_max %016" PRIx64 " exceeds hardware max %016" PRIx64 "\n",
|
|
+ xcr0_max, xfeature_mask);
|
|
+ domain_crash(curr->domain);
|
|
+
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ if ( (new_bv & ~xcr0_max) || !valid_xcr0(new_bv) )
|
|
return -EINVAL;
|
|
|
|
- /* XCR0.PKRU is disabled on PV mode. */
|
|
- if ( is_pv_vcpu(curr) && (new_bv & X86_XCR0_PKRU) )
|
|
- return -EOPNOTSUPP;
|
|
+ /* By this point, new_bv really should be accepted by hardware. */
|
|
+ if ( unlikely(!set_xcr0(new_bv)) )
|
|
+ {
|
|
+ gprintk(XENLOG_ERR, "new_bv %016" PRIx64 " rejected by hardware\n",
|
|
+ new_bv);
|
|
+ domain_crash(curr->domain);
|
|
|
|
- if ( !set_xcr0(new_bv) )
|
|
return -EFAULT;
|
|
+ }
|
|
|
|
mask = new_bv & ~curr->arch.xcr0_accum;
|
|
curr->arch.xcr0 = new_bv;
|
|
diff --git a/xen/common/cpu.c b/xen/common/cpu.c
|
|
index 6350f150bd..653a56b840 100644
|
|
--- a/xen/common/cpu.c
|
|
+++ b/xen/common/cpu.c
|
|
@@ -67,12 +67,17 @@ void __init register_cpu_notifier(struct notifier_block *nb)
|
|
spin_unlock(&cpu_add_remove_lock);
|
|
}
|
|
|
|
-static int take_cpu_down(void *unused)
|
|
+static void _take_cpu_down(void *unused)
|
|
{
|
|
void *hcpu = (void *)(long)smp_processor_id();
|
|
int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL);
|
|
BUG_ON(notifier_rc != NOTIFY_DONE);
|
|
__cpu_disable();
|
|
+}
|
|
+
|
|
+static int take_cpu_down(void *arg)
|
|
+{
|
|
+ _take_cpu_down(arg);
|
|
return 0;
|
|
}
|
|
|
|
@@ -98,7 +103,9 @@ int cpu_down(unsigned int cpu)
|
|
goto fail;
|
|
}
|
|
|
|
- if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
|
|
+ if ( unlikely(system_state < SYS_STATE_active) )
|
|
+ on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true);
|
|
+ else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
|
|
goto fail;
|
|
|
|
__cpu_die(cpu);
|
|
diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
|
|
index 999839444e..1e8edcbd57 100644
|
|
--- a/xen/common/cpupool.c
|
|
+++ b/xen/common/cpupool.c
|
|
@@ -490,7 +490,7 @@ static int cpupool_cpu_add(unsigned int cpu)
|
|
cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
|
|
cpumask_set_cpu(cpu, &cpupool_free_cpus);
|
|
|
|
- if ( system_state == SYS_STATE_resume )
|
|
+ if ( system_state == SYS_STATE_suspend || system_state == SYS_STATE_resume )
|
|
{
|
|
struct cpupool **c;
|
|
|
|
@@ -522,6 +522,7 @@ static int cpupool_cpu_add(unsigned int cpu)
|
|
* (or unplugging would have failed) and that is the default behavior
|
|
* anyway.
|
|
*/
|
|
+ per_cpu(cpupool, cpu) = NULL;
|
|
ret = cpupool_assign_cpu_locked(cpupool0, cpu);
|
|
}
|
|
out:
|
|
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
|
|
index 64d12685d3..6be0b3986f 100644
|
|
--- a/xen/common/efi/boot.c
|
|
+++ b/xen/common/efi/boot.c
|
|
@@ -1304,6 +1304,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
|
|
|
|
#ifndef CONFIG_ARM /* TODO - runtime service support */
|
|
|
|
+#include <asm/spec_ctrl.h>
|
|
+
|
|
static bool __initdata efi_map_uc;
|
|
|
|
static int __init parse_efi_param(const char *s)
|
|
@@ -1419,6 +1421,16 @@ void __init efi_init_memory(void)
|
|
desc->PhysicalStart, desc->PhysicalStart + len - 1,
|
|
desc->Type, desc->Attribute);
|
|
|
|
+ if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) ||
|
|
+ (efi_bs_revision >= EFI_REVISION(2, 5) &&
|
|
+ (desc->Attribute & EFI_MEMORY_WP)) )
|
|
+ {
|
|
+ /* Supplement the heuristics in l1tf_calculations(). */
|
|
+ l1tf_safe_maddr =
|
|
+ max(l1tf_safe_maddr,
|
|
+ ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE));
|
|
+ }
|
|
+
|
|
if ( !efi_enabled(EFI_RS) ||
|
|
(!(desc->Attribute & EFI_MEMORY_RUNTIME) &&
|
|
(!map_bs ||
|
|
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
|
|
index c757b7f6f5..231ecf509a 100644
|
|
--- a/xen/common/grant_table.c
|
|
+++ b/xen/common/grant_table.c
|
|
@@ -97,7 +97,11 @@ static unsigned int __read_mostly max_maptrack_frames =
|
|
DEFAULT_MAX_MAPTRACK_FRAMES;
|
|
integer_runtime_param("gnttab_max_maptrack_frames", max_maptrack_frames);
|
|
|
|
-static unsigned int __read_mostly opt_gnttab_max_version = 2;
|
|
+#ifndef GNTTAB_MAX_VERSION
|
|
+#define GNTTAB_MAX_VERSION 2
|
|
+#endif
|
|
+
|
|
+static unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION;
|
|
static bool __read_mostly opt_transitive_grants = true;
|
|
|
|
static int __init parse_gnttab(const char *s)
|
|
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
|
|
index 20ee1e4897..02aeed7c47 100644
|
|
--- a/xen/common/page_alloc.c
|
|
+++ b/xen/common/page_alloc.c
|
|
@@ -1426,7 +1426,7 @@ static void free_heap_pages(
|
|
|
|
page_list_del(predecessor, &heap(node, zone, order));
|
|
|
|
- /* Keep predecessor's first_dirty if it is already set. */
|
|
+ /* Update predecessor's first_dirty if necessary. */
|
|
if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
|
|
pg->u.free.first_dirty != INVALID_DIRTY_IDX )
|
|
predecessor->u.free.first_dirty = (1U << order) +
|
|
@@ -1447,6 +1447,12 @@ static void free_heap_pages(
|
|
|
|
check_and_stop_scrub(successor);
|
|
|
|
+ /* Update pg's first_dirty if necessary. */
|
|
+ if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX &&
|
|
+ successor->u.free.first_dirty != INVALID_DIRTY_IDX )
|
|
+ pg->u.free.first_dirty = (1U << order) +
|
|
+ successor->u.free.first_dirty;
|
|
+
|
|
page_list_del(successor, &heap(node, zone, order));
|
|
}
|
|
|
|
diff --git a/xen/common/tasklet.c b/xen/common/tasklet.c
|
|
index 0f0a6f8365..d4fea3151c 100644
|
|
--- a/xen/common/tasklet.c
|
|
+++ b/xen/common/tasklet.c
|
|
@@ -156,6 +156,10 @@ void tasklet_kill(struct tasklet *t)
|
|
|
|
spin_lock_irqsave(&tasklet_lock, flags);
|
|
|
|
+ /* Cope with uninitialised tasklets. */
|
|
+ if ( list_head_is_null(&t->list) )
|
|
+ goto unlock;
|
|
+
|
|
if ( !list_empty(&t->list) )
|
|
{
|
|
BUG_ON(t->is_dead || t->is_running || (t->scheduled_on < 0));
|
|
@@ -172,6 +176,7 @@ void tasklet_kill(struct tasklet *t)
|
|
spin_lock_irqsave(&tasklet_lock, flags);
|
|
}
|
|
|
|
+ unlock:
|
|
spin_unlock_irqrestore(&tasklet_lock, flags);
|
|
}
|
|
|
|
diff --git a/xen/include/asm-arm/arm32/system.h b/xen/include/asm-arm/arm32/system.h
|
|
index c617b40438..ab57abfbc5 100644
|
|
--- a/xen/include/asm-arm/arm32/system.h
|
|
+++ b/xen/include/asm-arm/arm32/system.h
|
|
@@ -48,6 +48,24 @@ static inline int local_fiq_is_enabled(void)
|
|
return !(flags & PSR_FIQ_MASK);
|
|
}
|
|
|
|
+#define CSDB ".inst 0xe320f014"
|
|
+
|
|
+static inline unsigned long array_index_mask_nospec(unsigned long idx,
|
|
+ unsigned long sz)
|
|
+{
|
|
+ unsigned long mask;
|
|
+
|
|
+ asm volatile( "cmp %1, %2\n"
|
|
+ "sbc %0, %1, %1\n"
|
|
+ CSDB
|
|
+ : "=r" (mask)
|
|
+ : "r" (idx), "Ir" (sz)
|
|
+ : "cc" );
|
|
+
|
|
+ return mask;
|
|
+}
|
|
+#define array_index_mask_nospec array_index_mask_nospec
|
|
+
|
|
#endif
|
|
/*
|
|
* Local variables:
|
|
diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h
|
|
index 2e2ee212a1..2e36573ac6 100644
|
|
--- a/xen/include/asm-arm/arm64/system.h
|
|
+++ b/xen/include/asm-arm/arm64/system.h
|
|
@@ -58,6 +58,28 @@ static inline int local_fiq_is_enabled(void)
|
|
return !(flags & PSR_FIQ_MASK);
|
|
}
|
|
|
|
+#define csdb() asm volatile ( "hint #20" : : : "memory" )
|
|
+
|
|
+/*
|
|
+ * Generate a mask for array_index_nospec() that is ~0UL when 0 <= idx < sz
|
|
+ * and 0 otherwise.
|
|
+ */
|
|
+static inline unsigned long array_index_mask_nospec(unsigned long idx,
|
|
+ unsigned long sz)
|
|
+{
|
|
+ unsigned long mask;
|
|
+
|
|
+ asm volatile ( "cmp %1, %2\n"
|
|
+ "sbc %0, xzr, xzr\n"
|
|
+ : "=r" (mask)
|
|
+ : "r" (idx), "Ir" (sz)
|
|
+ : "cc" );
|
|
+ csdb();
|
|
+
|
|
+ return mask;
|
|
+}
|
|
+#define array_index_mask_nospec array_index_mask_nospec
|
|
+
|
|
#endif
|
|
/*
|
|
* Local variables:
|
|
diff --git a/xen/include/asm-arm/grant_table.h b/xen/include/asm-arm/grant_table.h
|
|
index e52936c79f..24958e4670 100644
|
|
--- a/xen/include/asm-arm/grant_table.h
|
|
+++ b/xen/include/asm-arm/grant_table.h
|
|
@@ -7,6 +7,7 @@
|
|
#include <xen/sched.h>
|
|
|
|
#define INITIAL_NR_GRANT_FRAMES 1U
|
|
+#define GNTTAB_MAX_VERSION 1
|
|
|
|
struct grant_table_arch {
|
|
gfn_t *shared_gfn;
|
|
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
|
|
index 2cf8f7ea2a..b237da165c 100644
|
|
--- a/xen/include/asm-x86/cpufeature.h
|
|
+++ b/xen/include/asm-x86/cpufeature.h
|
|
@@ -113,6 +113,7 @@
|
|
#define cpu_has_aperfmperf boot_cpu_has(X86_FEATURE_APERFMPERF)
|
|
#define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH)
|
|
#define cpu_has_no_xpti boot_cpu_has(X86_FEATURE_NO_XPTI)
|
|
+#define cpu_has_xen_lbr boot_cpu_has(X86_FEATURE_XEN_LBR)
|
|
|
|
enum _cache_type {
|
|
CACHE_TYPE_NULL = 0,
|
|
diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
|
|
index b90aa2d046..8e5cc53dde 100644
|
|
--- a/xen/include/asm-x86/cpufeatures.h
|
|
+++ b/xen/include/asm-x86/cpufeatures.h
|
|
@@ -32,3 +32,4 @@ XEN_CPUFEATURE(SC_RSB_PV, (FSCAPINTS+0)*32+18) /* RSB overwrite needed for
|
|
XEN_CPUFEATURE(SC_RSB_HVM, (FSCAPINTS+0)*32+19) /* RSB overwrite needed for HVM */
|
|
XEN_CPUFEATURE(NO_XPTI, (FSCAPINTS+0)*32+20) /* XPTI mitigation not in use */
|
|
XEN_CPUFEATURE(SC_MSR_IDLE, (FSCAPINTS+0)*32+21) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
|
|
+XEN_CPUFEATURE(XEN_LBR, (FSCAPINTS+0)*32+22) /* Xen uses MSR_DEBUGCTL.LBR */
|
|
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
|
|
index e0d413c7de..61e6900465 100644
|
|
--- a/xen/include/asm-x86/domain.h
|
|
+++ b/xen/include/asm-x86/domain.h
|
|
@@ -121,6 +121,11 @@ struct shadow_domain {
|
|
|
|
/* Has this domain ever used HVMOP_pagetable_dying? */
|
|
bool_t pagetable_dying_op;
|
|
+
|
|
+#ifdef CONFIG_PV
|
|
+ /* PV L1 Terminal Fault mitigation. */
|
|
+ struct tasklet pv_l1tf_tasklet;
|
|
+#endif /* CONFIG_PV */
|
|
#endif
|
|
};
|
|
|
|
@@ -257,6 +262,8 @@ struct pv_domain
|
|
bool xpti;
|
|
/* Use PCID feature? */
|
|
bool pcid;
|
|
+ /* Mitigate L1TF with shadow/crashing? */
|
|
+ bool check_l1tf;
|
|
|
|
/* map_domain_page() mapping cache. */
|
|
struct mapcache_domain mapcache;
|
|
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
|
|
index 06c3179cec..57e5098b99 100644
|
|
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
|
|
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
|
|
@@ -130,10 +130,18 @@ struct arch_vmx_struct {
|
|
uint64_t sfmask;
|
|
|
|
struct vmx_msr_bitmap *msr_bitmap;
|
|
- unsigned int msr_count;
|
|
+
|
|
+ /*
|
|
+ * Most accesses to the MSR host/guest load/save lists are in current
|
|
+ * context. However, the data can be modified by toolstack/migration
|
|
+ * actions. Remote access is only permitted for paused vcpus, and is
|
|
+ * protected under the domctl lock.
|
|
+ */
|
|
struct vmx_msr_entry *msr_area;
|
|
- unsigned int host_msr_count;
|
|
struct vmx_msr_entry *host_msr_area;
|
|
+ unsigned int msr_load_count;
|
|
+ unsigned int msr_save_count;
|
|
+ unsigned int host_msr_count;
|
|
|
|
unsigned long eoi_exitmap_changed;
|
|
DECLARE_BITMAP(eoi_exit_bitmap, NR_VECTORS);
|
|
@@ -149,7 +157,7 @@ struct arch_vmx_struct {
|
|
/* Are we emulating rather than VMENTERing? */
|
|
uint8_t vmx_emulate;
|
|
|
|
- uint8_t lbr_fixup_enabled;
|
|
+ uint8_t lbr_flags;
|
|
|
|
/* Bitmask of segments that we can't safely use in virtual 8086 mode */
|
|
uint16_t vm86_segment_mask;
|
|
@@ -514,9 +522,6 @@ enum vmcs_field {
|
|
|
|
#define VMCS_VPID_WIDTH 16
|
|
|
|
-#define VMX_GUEST_MSR 0
|
|
-#define VMX_HOST_MSR 1
|
|
-
|
|
/* VM Instruction error numbers */
|
|
enum vmx_insn_errno
|
|
{
|
|
@@ -534,6 +539,67 @@ enum vmx_insn_errno
|
|
VMX_INSN_FAIL_INVALID = ~0,
|
|
};
|
|
|
|
+/* MSR load/save list infrastructure. */
|
|
+enum vmx_msr_list_type {
|
|
+ VMX_MSR_HOST, /* MSRs loaded on VMExit. */
|
|
+ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */
|
|
+ VMX_MSR_GUEST_LOADONLY, /* MSRs loaded on VMEntry only. */
|
|
+};
|
|
+
|
|
+/**
|
|
+ * Add an MSR to an MSR list (inserting space for the entry if necessary), and
|
|
+ * set the MSR's value.
|
|
+ *
|
|
+ * It is undefined behaviour to try and insert the same MSR into both the
|
|
+ * GUEST and GUEST_LOADONLY list.
|
|
+ *
|
|
+ * May fail if unable to allocate memory for the list, or the total number of
|
|
+ * entries exceeds the memory allocated.
|
|
+ */
|
|
+int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
|
|
+ enum vmx_msr_list_type type);
|
|
+
|
|
+static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr, uint64_t val)
|
|
+{
|
|
+ return vmx_add_msr(v, msr, val, VMX_MSR_GUEST);
|
|
+}
|
|
+static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr,
|
|
+ uint64_t val)
|
|
+{
|
|
+ return vmx_add_msr(v, msr, val, VMX_MSR_HOST);
|
|
+}
|
|
+
|
|
+struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
|
|
+ enum vmx_msr_list_type type);
|
|
+
|
|
+static inline int vmx_read_guest_msr(const struct vcpu *v, uint32_t msr,
|
|
+ uint64_t *val)
|
|
+{
|
|
+ const struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
|
|
+
|
|
+ if ( !ent )
|
|
+ return -ESRCH;
|
|
+
|
|
+ *val = ent->data;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline int vmx_write_guest_msr(struct vcpu *v, uint32_t msr,
|
|
+ uint64_t val)
|
|
+{
|
|
+ struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
|
|
+
|
|
+ if ( !ent )
|
|
+ return -ESRCH;
|
|
+
|
|
+ ent->data = val;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+
|
|
+/* MSR intercept bitmap infrastructure. */
|
|
enum vmx_msr_intercept_type {
|
|
VMX_MSR_R = 1,
|
|
VMX_MSR_W = 2,
|
|
@@ -544,10 +610,6 @@ void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
|
|
enum vmx_msr_intercept_type type);
|
|
void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr,
|
|
enum vmx_msr_intercept_type type);
|
|
-int vmx_read_guest_msr(u32 msr, u64 *val);
|
|
-int vmx_write_guest_msr(u32 msr, u64 val);
|
|
-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type);
|
|
-int vmx_add_msr(u32 msr, int type);
|
|
void vmx_vmcs_switch(paddr_t from, paddr_t to);
|
|
void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector);
|
|
void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector);
|
|
@@ -562,15 +624,6 @@ void virtual_vmcs_vmwrite(const struct vcpu *, u32 encoding, u64 val);
|
|
enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v,
|
|
u32 vmcs_encoding, u64 val);
|
|
|
|
-static inline int vmx_add_guest_msr(u32 msr)
|
|
-{
|
|
- return vmx_add_msr(msr, VMX_GUEST_MSR);
|
|
-}
|
|
-static inline int vmx_add_host_load_msr(u32 msr)
|
|
-{
|
|
- return vmx_add_msr(msr, VMX_HOST_MSR);
|
|
-}
|
|
-
|
|
DECLARE_PER_CPU(bool_t, vmxon);
|
|
|
|
bool_t vmx_vcpu_pml_enabled(const struct vcpu *v);
|
|
diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h
|
|
index 1cc2e37d5c..da38b7991c 100644
|
|
--- a/xen/include/asm-x86/hypercall.h
|
|
+++ b/xen/include/asm-x86/hypercall.h
|
|
@@ -165,7 +165,7 @@ extern int compat_update_va_mapping(
|
|
unsigned int va, u32 lo, u32 hi, unsigned int flags);
|
|
|
|
extern int compat_update_va_mapping_otherdomain(
|
|
- unsigned long va, u32 lo, u32 hi, unsigned long flags, domid_t domid);
|
|
+ unsigned int va, u32 lo, u32 hi, unsigned int flags, domid_t domid);
|
|
|
|
DEFINE_XEN_GUEST_HANDLE(trap_info_compat_t);
|
|
extern int compat_set_trap_table(XEN_GUEST_HANDLE(trap_info_compat_t) traps);
|
|
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
|
|
index 8fbccc88a7..7235623c86 100644
|
|
--- a/xen/include/asm-x86/msr-index.h
|
|
+++ b/xen/include/asm-x86/msr-index.h
|
|
@@ -47,8 +47,12 @@
|
|
#define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0)
|
|
#define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1)
|
|
#define ARCH_CAPS_RSBA (_AC(1, ULL) << 2)
|
|
+#define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3)
|
|
#define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
|
|
|
|
+#define MSR_FLUSH_CMD 0x0000010b
|
|
+#define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
|
|
+
|
|
/* Intel MSRs. Some also available on other CPUs */
|
|
#define MSR_IA32_PERFCTR0 0x000000c1
|
|
#define MSR_IA32_A_PERFCTR0 0x000004c1
|
|
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
|
|
index f14f265aa5..afbeb7f155 100644
|
|
--- a/xen/include/asm-x86/msr.h
|
|
+++ b/xen/include/asm-x86/msr.h
|
|
@@ -241,7 +241,7 @@ static inline void write_efer(uint64_t val)
|
|
wrmsrl(MSR_EFER, val);
|
|
}
|
|
|
|
-DECLARE_PER_CPU(u32, ler_msr);
|
|
+extern unsigned int ler_msr;
|
|
|
|
DECLARE_PER_CPU(uint32_t, tsc_aux);
|
|
|
|
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
|
|
index f0085511c7..f440e3e53c 100644
|
|
--- a/xen/include/asm-x86/paging.h
|
|
+++ b/xen/include/asm-x86/paging.h
|
|
@@ -37,11 +37,14 @@
|
|
|
|
#define PG_SH_shift 20
|
|
#define PG_HAP_shift 21
|
|
+#define PG_SHF_shift 22
|
|
/* We're in one of the shadow modes */
|
|
#ifdef CONFIG_SHADOW_PAGING
|
|
#define PG_SH_enable (1U << PG_SH_shift)
|
|
+#define PG_SH_forced (1U << PG_SHF_shift)
|
|
#else
|
|
#define PG_SH_enable 0
|
|
+#define PG_SH_forced 0
|
|
#endif
|
|
#define PG_HAP_enable (1U << PG_HAP_shift)
|
|
|
|
@@ -62,6 +65,7 @@
|
|
|
|
#define paging_mode_enabled(_d) (!!(_d)->arch.paging.mode)
|
|
#define paging_mode_shadow(_d) (!!((_d)->arch.paging.mode & PG_SH_enable))
|
|
+#define paging_mode_sh_forced(_d) (!!((_d)->arch.paging.mode & PG_SH_forced))
|
|
#define paging_mode_hap(_d) (!!((_d)->arch.paging.mode & PG_HAP_enable))
|
|
|
|
#define paging_mode_refcounts(_d) (!!((_d)->arch.paging.mode & PG_refcounts))
|
|
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
|
|
index 9924cdf1f3..2bd9e69684 100644
|
|
--- a/xen/include/asm-x86/processor.h
|
|
+++ b/xen/include/asm-x86/processor.h
|
|
@@ -337,12 +337,6 @@ static always_inline void set_in_cr4 (unsigned long mask)
|
|
write_cr4(read_cr4() | mask);
|
|
}
|
|
|
|
-static always_inline void clear_in_cr4 (unsigned long mask)
|
|
-{
|
|
- mmu_cr4_features &= ~mask;
|
|
- write_cr4(read_cr4() & ~mask);
|
|
-}
|
|
-
|
|
static inline unsigned int read_pkru(void)
|
|
{
|
|
unsigned int pkru;
|
|
diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
|
|
index 19232afa01..c09a5ff381 100644
|
|
--- a/xen/include/asm-x86/setup.h
|
|
+++ b/xen/include/asm-x86/setup.h
|
|
@@ -66,6 +66,8 @@ extern uint8_t kbd_shift_flags;
|
|
extern unsigned long highmem_start;
|
|
#endif
|
|
|
|
+extern int8_t opt_smt;
|
|
+
|
|
#ifdef CONFIG_SHADOW_PAGING
|
|
extern bool opt_dom0_shadow;
|
|
#else
|
|
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
|
|
index 94a34fd16a..f40f411871 100644
|
|
--- a/xen/include/asm-x86/shadow.h
|
|
+++ b/xen/include/asm-x86/shadow.h
|
|
@@ -29,6 +29,7 @@
|
|
#include <asm/flushtlb.h>
|
|
#include <asm/paging.h>
|
|
#include <asm/p2m.h>
|
|
+#include <asm/spec_ctrl.h>
|
|
|
|
/*****************************************************************************
|
|
* Macros to tell which shadow paging mode a domain is in*/
|
|
@@ -115,6 +116,131 @@ static inline int shadow_domctl(struct domain *d,
|
|
|
|
#endif /* CONFIG_SHADOW_PAGING */
|
|
|
|
+/*
|
|
+ * Mitigations for L1TF / CVE-2018-3620 for PV guests.
|
|
+ *
|
|
+ * We cannot alter an architecturally-legitimate PTE which a PV guest has
|
|
+ * chosen to write, as traditional paged-out metadata is L1TF-vulnerable.
|
|
+ * What we can do is force a PV guest which writes a vulnerable PTE into
|
|
+ * shadow mode, so Xen controls the pagetables which are reachable by the CPU
|
|
+ * pagewalk.
|
|
+ *
|
|
+ * The core of the L1TF vulnerability is that the address bits of the PTE
|
|
+ * (accounting for PSE and factoring in the level-relevant part of the linear
|
|
+ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or
|
|
+ * eventual memory address) before the Present or reserved bits (which would
|
|
+ * cause a terminal fault) are accounted for. If an L1D hit occurs, the
|
|
+ * resulting data is available for potentially dependent instructions.
|
|
+ *
|
|
+ * For Present PTEs, the PV type-count safety logic ensures that the address
|
|
+ * bits always point at a guest-accessible frame, which is safe WRT L1TF from
|
|
+ * Xen's point of view. In practice, a PV guest should be unable to set any
|
|
+ * reserved bits, so should be unable to create any present L1TF-vulnerable
|
|
+ * PTEs at all.
|
|
+ *
|
|
+ * Therefore, these safety checks apply to Not-Present PTEs only, where
|
|
+ * traditionally, Xen would have let the guest write any value it chose.
|
|
+ *
|
|
+ * The all-zero PTE potentially leaks mfn 0. All software on the system is
|
|
+ * expected to cooperate and not put any secrets there. In a Xen system,
|
|
+ * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains
|
|
+ * the real mode IVT and BIOS Data Area.  Therefore, mfn 0 is considered safe.
|
|
+ *
|
|
+ * Any PTE whose address is higher than the maximum cacheable address is safe,
|
|
+ * as it won't get an L1D hit.
|
|
+ *
|
|
+ * Speculative superpages also need accounting for, as PSE is considered
|
|
+ * irrespective of Present. We disallow PSE being set, as it allows an
|
|
+ * attacker to leak 2M or 1G of data starting from mfn 0. Also, because of
|
|
+ * recursive/linear pagetables, we must consider PSE even at L4, as hardware
|
|
+ * will interpret an L4e as an L3e during a recursive walk.
|
|
+ */
|
|
+
|
|
+static inline bool is_l1tf_safe_maddr(intpte_t pte)
|
|
+{
|
|
+ paddr_t maddr = pte & l1tf_addr_mask;
|
|
+
|
|
+ return maddr == 0 || maddr >= l1tf_safe_maddr;
|
|
+}
|
|
+
|
|
+static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level,
|
|
+ intpte_t pte)
|
|
+{
|
|
+ ASSERT(is_pv_domain(d));
|
|
+ ASSERT(!(pte & _PAGE_PRESENT));
|
|
+
|
|
+ if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) &&
|
|
+ (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) )
|
|
+ {
|
|
+#ifdef CONFIG_SHADOW_PAGING
|
|
+ struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet;
|
|
+
|
|
+ printk(XENLOG_G_WARNING
|
|
+ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n",
|
|
+ d->domain_id, level, pte);
|
|
+ /*
|
|
+ * Safety consideration for accessing tasklet.scheduled_on without the
|
|
+ * tasklet lock. This is a singleshot tasklet with the side effect of
|
|
+ * setting PG_SH_forced (checked just above). Multiple vcpus can race
|
|
+ * to schedule the tasklet, but if we observe it scheduled anywhere,
|
|
+ * that is good enough.
|
|
+ */
|
|
+ smp_rmb();
|
|
+ if ( !tasklet_is_scheduled(t) )
|
|
+ tasklet_schedule(t);
|
|
+#else
|
|
+ printk(XENLOG_G_ERR
|
|
+ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n",
|
|
+ d->domain_id, level, pte);
|
|
+ domain_crash(d);
|
|
+#endif
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e)
|
|
+{
|
|
+ return pv_l1tf_check_pte(d, 1, l1e.l1);
|
|
+}
|
|
+
|
|
+static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e)
|
|
+{
|
|
+ return pv_l1tf_check_pte(d, 2, l2e.l2);
|
|
+}
|
|
+
|
|
+static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e)
|
|
+{
|
|
+ return pv_l1tf_check_pte(d, 3, l3e.l3);
|
|
+}
|
|
+
|
|
+static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e)
|
|
+{
|
|
+ return pv_l1tf_check_pte(d, 4, l4e.l4);
|
|
+}
|
|
+
|
|
+void pv_l1tf_tasklet(unsigned long data);
|
|
+
|
|
+static inline void pv_l1tf_domain_init(struct domain *d)
|
|
+{
|
|
+ d->arch.pv_domain.check_l1tf =
|
|
+ opt_pv_l1tf & (is_hardware_domain(d)
|
|
+ ? OPT_PV_L1TF_DOM0 : OPT_PV_L1TF_DOMU);
|
|
+
|
|
+#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
|
|
+ tasklet_init(&d->arch.paging.shadow.pv_l1tf_tasklet,
|
|
+ pv_l1tf_tasklet, (unsigned long)d);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline void pv_l1tf_domain_destroy(struct domain *d)
|
|
+{
|
|
+#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
|
|
+ tasklet_kill(&d->arch.paging.shadow.pv_l1tf_tasklet);
|
|
+#endif
|
|
+}
|
|
+
|
|
/* Remove all shadows of the guest mfn. */
|
|
static inline void shadow_remove_all_shadows(struct domain *d, mfn_t gmfn)
|
|
{
|
|
diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
|
|
index 4e5f673fec..09c55458df 100644
|
|
--- a/xen/include/asm-x86/smp.h
|
|
+++ b/xen/include/asm-x86/smp.h
|
|
@@ -26,6 +26,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
|
|
DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
|
|
DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
|
|
|
|
+extern bool park_offline_cpus;
|
|
+
|
|
void smp_send_nmi_allbutself(void);
|
|
|
|
void send_IPI_mask(const cpumask_t *, int vector);
|
|
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
|
|
index 5b40afbab0..8f8aad40bb 100644
|
|
--- a/xen/include/asm-x86/spec_ctrl.h
|
|
+++ b/xen/include/asm-x86/spec_ctrl.h
|
|
@@ -29,15 +29,27 @@ void init_speculation_mitigations(void);
|
|
extern bool opt_ibpb;
|
|
extern bool opt_ssbd;
|
|
extern int8_t opt_eager_fpu;
|
|
+extern int8_t opt_l1d_flush;
|
|
|
|
extern bool bsp_delay_spec_ctrl;
|
|
extern uint8_t default_xen_spec_ctrl;
|
|
extern uint8_t default_spec_ctrl_flags;
|
|
|
|
-extern uint8_t opt_xpti;
|
|
+extern int8_t opt_xpti;
|
|
#define OPT_XPTI_DOM0 0x01
|
|
#define OPT_XPTI_DOMU 0x02
|
|
|
|
+extern int8_t opt_pv_l1tf;
|
|
+#define OPT_PV_L1TF_DOM0 0x01
|
|
+#define OPT_PV_L1TF_DOMU 0x02
|
|
+
|
|
+/*
|
|
+ * The L1D address mask, which might be wider than reported in CPUID, and the
|
|
+ * system physical address above which there are believed to be no cacheable
|
|
+ * memory regions, thus unable to leak data via the L1TF vulnerability.
|
|
+ */
|
|
+extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;
|
|
+
|
|
static inline void init_shadow_spec_ctrl_state(void)
|
|
{
|
|
struct cpu_info *info = get_cpu_info();
|
|
diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
|
|
index 43fb6fe489..483cd20afd 100644
|
|
--- a/xen/include/asm-x86/system.h
|
|
+++ b/xen/include/asm-x86/system.h
|
|
@@ -221,6 +221,30 @@ static always_inline unsigned long __xadd(
|
|
#define set_mb(var, value) do { xchg(&var, value); } while (0)
|
|
#define set_wmb(var, value) do { var = value; smp_wmb(); } while (0)
|
|
|
|
+/**
|
|
+ * array_index_mask_nospec() - generate a mask that is ~0UL when the
|
|
+ * bounds check succeeds and 0 otherwise
|
|
+ * @index: array element index
|
|
+ * @size: number of elements in array
|
|
+ *
|
|
+ * Returns:
|
|
+ * 0 - (index < size)
|
|
+ */
|
|
+static inline unsigned long array_index_mask_nospec(unsigned long index,
|
|
+ unsigned long size)
|
|
+{
|
|
+ unsigned long mask;
|
|
+
|
|
+ asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];"
|
|
+ : [mask] "=r" (mask)
|
|
+ : [size] "g" (size), [index] "r" (index) );
|
|
+
|
|
+ return mask;
|
|
+}
|
|
+
|
|
+/* Override default implementation in nospec.h. */
|
|
+#define array_index_mask_nospec array_index_mask_nospec
|
|
+
|
|
#define local_irq_disable() asm volatile ( "cli" : : : "memory" )
|
|
#define local_irq_enable() asm volatile ( "sti" : : : "memory" )
|
|
|
|
diff --git a/xen/include/asm-x86/xstate.h b/xen/include/asm-x86/xstate.h
|
|
index 86a4a1f75c..47f602b855 100644
|
|
--- a/xen/include/asm-x86/xstate.h
|
|
+++ b/xen/include/asm-x86/xstate.h
|
|
@@ -97,8 +97,9 @@ void xsave(struct vcpu *v, uint64_t mask);
|
|
void xrstor(struct vcpu *v, uint64_t mask);
|
|
void xstate_set_init(uint64_t mask);
|
|
bool xsave_enabled(const struct vcpu *v);
|
|
-int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum,
|
|
- const struct xsave_hdr *);
|
|
+int __must_check validate_xstate(const struct domain *d,
|
|
+ uint64_t xcr0, uint64_t xcr0_accum,
|
|
+ const struct xsave_hdr *hdr);
|
|
int __must_check handle_xsetbv(u32 index, u64 new_bv);
|
|
void expand_xsave_states(struct vcpu *v, void *dest, unsigned int size);
|
|
void compress_xsave_states(struct vcpu *v, const void *src, unsigned int size);
|
|
diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
|
|
index f1a5ed93e0..6c82816fd3 100644
|
|
--- a/xen/include/public/arch-x86/cpufeatureset.h
|
|
+++ b/xen/include/public/arch-x86/cpufeatureset.h
|
|
@@ -244,6 +244,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions *
|
|
XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */
|
|
XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */
|
|
XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */
|
|
+XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /*S MSR_FLUSH_CMD and L1D flush. */
|
|
XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */
|
|
XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */
|
|
|
|
diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h
|
|
index 533a8ea0f3..a7e05681c9 100644
|
|
--- a/xen/include/xen/compiler.h
|
|
+++ b/xen/include/xen/compiler.h
|
|
@@ -81,6 +81,9 @@
|
|
#pragma GCC visibility push(hidden)
|
|
#endif
|
|
|
|
+/* Make the optimizer believe the variable can be manipulated arbitrarily. */
|
|
+#define OPTIMIZER_HIDE_VAR(var) __asm__ ( "" : "+g" (var) )
|
|
+
|
|
/* This macro obfuscates arithmetic on a variable address so that gcc
|
|
shouldn't recognize the original var, and make assumptions about it */
|
|
/*
|
|
diff --git a/xen/include/xen/cpu.h b/xen/include/xen/cpu.h
|
|
index ffefc09f8e..2fe3ec05d8 100644
|
|
--- a/xen/include/xen/cpu.h
|
|
+++ b/xen/include/xen/cpu.h
|
|
@@ -47,6 +47,8 @@ void register_cpu_notifier(struct notifier_block *nb);
|
|
#define CPU_DYING (0x0007 | NOTIFY_REVERSE)
|
|
/* CPU_DEAD: CPU is dead. */
|
|
#define CPU_DEAD (0x0008 | NOTIFY_REVERSE)
|
|
+/* CPU_REMOVE: CPU was removed. */
|
|
+#define CPU_REMOVE (0x0009 | NOTIFY_REVERSE)
|
|
|
|
/* Perform CPU hotplug. May return -EAGAIN. */
|
|
int cpu_down(unsigned int cpu);
|
|
diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h
|
|
index 42340a098e..4a11bcc3f3 100644
|
|
--- a/xen/include/xen/cpumask.h
|
|
+++ b/xen/include/xen/cpumask.h
|
|
@@ -351,16 +351,35 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
|
|
return *mask != NULL;
|
|
}
|
|
|
|
+static inline bool cond_alloc_cpumask_var(cpumask_var_t *mask)
|
|
+{
|
|
+ if (*mask == NULL)
|
|
+ *mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long));
|
|
+ return *mask != NULL;
|
|
+}
|
|
+
|
|
static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
|
|
{
|
|
*(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
|
|
return *mask != NULL;
|
|
}
|
|
|
|
+static inline bool cond_zalloc_cpumask_var(cpumask_var_t *mask)
|
|
+{
|
|
+ if (*mask == NULL)
|
|
+ *mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
|
|
+ else
|
|
+ cpumask_clear(*mask);
|
|
+ return *mask != NULL;
|
|
+}
|
|
+
|
|
static inline void free_cpumask_var(cpumask_var_t mask)
|
|
{
|
|
xfree(mask);
|
|
}
|
|
+
|
|
+/* Free an allocated mask, and zero the pointer to it. */
|
|
+#define FREE_CPUMASK_VAR(m) XFREE(m)
|
|
#else
|
|
typedef cpumask_t cpumask_var_t[1];
|
|
|
|
@@ -368,16 +387,20 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
|
|
{
|
|
return 1;
|
|
}
|
|
+#define cond_alloc_cpumask_var alloc_cpumask_var
|
|
|
|
static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
|
|
{
|
|
cpumask_clear(*mask);
|
|
return 1;
|
|
}
|
|
+#define cond_zalloc_cpumask_var zalloc_cpumask_var
|
|
|
|
static inline void free_cpumask_var(cpumask_var_t mask)
|
|
{
|
|
}
|
|
+
|
|
+#define FREE_CPUMASK_VAR(m) free_cpumask_var(m)
|
|
#endif
|
|
|
|
#if NR_CPUS > 1
|
|
diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h
|
|
index fa07d720ee..1387abb211 100644
|
|
--- a/xen/include/xen/list.h
|
|
+++ b/xen/include/xen/list.h
|
|
@@ -51,6 +51,11 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
|
|
list->prev = list;
|
|
}
|
|
|
|
+static inline bool list_head_is_null(const struct list_head *list)
|
|
+{
|
|
+ return !list->next && !list->prev;
|
|
+}
|
|
+
|
|
/*
|
|
* Insert a new entry between two known consecutive entries.
|
|
*
|
|
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
|
|
index e928551c91..24654e8e22 100644
|
|
--- a/xen/include/xen/mm.h
|
|
+++ b/xen/include/xen/mm.h
|
|
@@ -162,6 +162,14 @@ void free_xenheap_pages(void *v, unsigned int order);
|
|
bool scrub_free_pages(void);
|
|
#define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
|
|
#define free_xenheap_page(v) (free_xenheap_pages(v,0))
|
|
+
|
|
+/* Free an allocation, and zero the pointer to it. */
|
|
+#define FREE_XENHEAP_PAGES(p, o) do { \
|
|
+ free_xenheap_pages(p, o); \
|
|
+ (p) = NULL; \
|
|
+} while ( false )
|
|
+#define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0)
|
|
+
|
|
/* Map machine page range in Xen virtual address space. */
|
|
int map_pages_to_xen(
|
|
unsigned long virt,
|
|
diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h
|
|
new file mode 100644
|
|
index 0000000000..48793996e8
|
|
--- /dev/null
|
|
+++ b/xen/include/xen/nospec.h
|
|
@@ -0,0 +1,70 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+/* Copyright(c) 2018 Linus Torvalds. All rights reserved. */
|
|
+/* Copyright(c) 2018 Alexei Starovoitov. All rights reserved. */
|
|
+/* Copyright(c) 2018 Intel Corporation. All rights reserved. */
|
|
+/* Copyright(c) 2018 Citrix Systems R&D Ltd. All rights reserved. */
|
|
+
|
|
+#ifndef XEN_NOSPEC_H
|
|
+#define XEN_NOSPEC_H
|
|
+
|
|
+#include <asm/system.h>
|
|
+
|
|
+/**
|
|
+ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
|
|
+ * @index: array element index
|
|
+ * @size: number of elements in array
|
|
+ *
|
|
+ * When @index is out of bounds (@index >= @size), the sign bit will be
|
|
+ * set. Extend the sign bit to all bits and invert, giving a result of
|
|
+ * zero for an out of bounds index, or ~0 if within bounds [0, @size).
|
|
+ */
|
|
+#ifndef array_index_mask_nospec
|
|
+static inline unsigned long array_index_mask_nospec(unsigned long index,
|
|
+ unsigned long size)
|
|
+{
|
|
+ /*
|
|
+ * Always calculate and emit the mask even if the compiler
|
|
+ * thinks the mask is not needed. The compiler does not take
|
|
+ * into account the value of @index under speculation.
|
|
+ */
|
|
+ OPTIMIZER_HIDE_VAR(index);
|
|
+ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
|
|
+}
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * array_index_nospec - sanitize an array index after a bounds check
|
|
+ *
|
|
+ * For a code sequence like:
|
|
+ *
|
|
+ * if (index < size) {
|
|
+ * index = array_index_nospec(index, size);
|
|
+ * val = array[index];
|
|
+ * }
|
|
+ *
|
|
+ * ...if the CPU speculates past the bounds check then
|
|
+ * array_index_nospec() will clamp the index within the range of [0,
|
|
+ * size).
|
|
+ */
|
|
+#define array_index_nospec(index, size) \
|
|
+({ \
|
|
+ typeof(index) _i = (index); \
|
|
+ typeof(size) _s = (size); \
|
|
+ unsigned long _mask = array_index_mask_nospec(_i, _s); \
|
|
+ \
|
|
+ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
|
|
+ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
|
|
+ \
|
|
+ (typeof(_i)) (_i & _mask); \
|
|
+})
|
|
+
|
|
+#endif /* XEN_NOSPEC_H */
|
|
+
|
|
+/*
|
|
+ * Local variables:
|
|
+ * mode: C
|
|
+ * c-file-style: "BSD"
|
|
+ * c-basic-offset: 4
|
|
+ * indent-tabs-mode: nil
|
|
+ * End:
|
|
+ */
|
|
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
|
|
index 99d2af2e1f..e79d5a36ca 100644
|
|
--- a/xen/include/xen/sched.h
|
|
+++ b/xen/include/xen/sched.h
|
|
@@ -788,7 +788,7 @@ static inline struct domain *next_domain_in_cpupool(
|
|
#define _VPF_parked 8
|
|
#define VPF_parked (1UL<<_VPF_parked)
|
|
|
|
-static inline int vcpu_runnable(struct vcpu *v)
|
|
+static inline bool vcpu_runnable(const struct vcpu *v)
|
|
{
|
|
return !(v->pause_flags |
|
|
atomic_read(&v->pause_count) |
|
|
diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h
|
|
index 23d69c738e..bc9ddace6d 100644
|
|
--- a/xen/include/xen/tasklet.h
|
|
+++ b/xen/include/xen/tasklet.h
|
|
@@ -50,6 +50,11 @@ static inline bool tasklet_work_to_do(unsigned int cpu)
|
|
TASKLET_scheduled);
|
|
}
|
|
|
|
+static inline bool tasklet_is_scheduled(const struct tasklet *t)
|
|
+{
|
|
+ return t->scheduled_on != -1;
|
|
+}
|
|
+
|
|
void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu);
|
|
void tasklet_schedule(struct tasklet *t);
|
|
void do_tasklet(void);
|
|
diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h
|
|
index cc2673d8ae..9aa5edf593 100644
|
|
--- a/xen/include/xen/xmalloc.h
|
|
+++ b/xen/include/xen/xmalloc.h
|
|
@@ -26,6 +26,12 @@
|
|
/* Free any of the above. */
|
|
extern void xfree(void *);
|
|
|
|
+/* Free an allocation, and zero the pointer to it. */
|
|
+#define XFREE(p) do { \
|
|
+ xfree(p); \
|
|
+ (p) = NULL; \
|
|
+} while ( false )
|
|
+
|
|
/* Underlying functions */
|
|
extern void *_xmalloc(unsigned long size, unsigned long align);
|
|
extern void *_xzalloc(unsigned long size, unsigned long align);
|