SlackBuildsOrg/system/xen/xsa/xsa273-d757c29ffe2e31b15397e43cd58da88b6318b654.patch
Mario Preksavec 9be84725e7
system/xen: Updated for version 4.11.0
Signed-off-by: Mario Preksavec <mario@slackware.hr>
2018-09-01 07:32:30 +07:00


diff --git a/docs/man/xl.conf.pod.5 b/docs/man/xl.conf.pod.5
index da91b8626c..37262a7ef8 100644
--- a/docs/man/xl.conf.pod.5
+++ b/docs/man/xl.conf.pod.5
@@ -185,6 +185,28 @@ massively huge guests).
=back
+=item B<vm.cpumask>="CPULIST"
+
+=item B<vm.hvm.cpumask>="CPULIST"
+
+=item B<vm.pv.cpumask>="CPULIST"
+
+Global masks that are applied when creating guests and pinning vcpus
+to indicate which cpus they are allowed to run on. Specifically,
+C<vm.cpumask> applies to all guest types, C<vm.hvm.cpumask> applies to
+both HVM and PVH guests and C<vm.pv.cpumask> applies to PV guests.
+
+The hard affinity of a guest's vcpus is logical-AND'ed with the respective
+masks. If the resulting affinity mask is empty, the operation will fail.
+
+Use --ignore-global-affinity-masks to skip applying global masks.
+
+The default value for these masks is all 1's, i.e. all cpus are allowed.
+
+Due to bug(s), these options may not interact well with other options
+concerning CPU affinity. One example is CPU pools. Users should always double
+check that the required affinity has taken effect.
+
=back
=head1 SEE ALSO
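
The combination described above is a plain bitwise AND of the guest's hard
affinity with the global mask and the type-specific mask. A minimal standalone
sketch of that behaviour, using a plain uint64_t instead of libxl's
libxl_bitmap and therefore illustrative only, looks like this:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: real xl operates on libxl_bitmap, not uint64_t. */
    static int combine_affinity(uint64_t hard, uint64_t vm_mask,
                                uint64_t type_mask, uint64_t *out)
    {
        uint64_t result = hard & vm_mask & type_mask;

        if (!result)
            return -1;          /* empty result: the operation must fail */

        *out = result;
        return 0;
    }

    int main(void)
    {
        uint64_t eff;

        /* vm.cpumask="0-7", vm.pv.cpumask="0-3", guest asks for cpus 2-5 */
        if (combine_affinity(0x3c, 0xff, 0x0f, &eff) == 0)
            printf("effective hard affinity: %#" PRIx64 "\n", eff); /* 0xc */

        return 0;
    }

xl itself performs the same two AND steps with libxl_bitmap_and(), as the
apply_global_affinity_masks() hunk later in this patch shows.
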
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 075e5ea159..0886706368 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -489,10 +489,10 @@ accounting for hardware capabilities as enumerated via CPUID.
Currently accepted:
-The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`, `ssbd` are
-used by default if available and applicable. They can be ignored,
-e.g. `no-ibrsb`, at which point Xen won't use them itself, and won't offer
-them to guests.
+The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`,
+`l1d-flush` and `ssbd` are used by default if available and applicable. They can
+be ignored, e.g. `no-ibrsb`, at which point Xen won't use them itself, and
+won't offer them to guests.
### cpuid\_mask\_cpu (AMD only)
> `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b`
@@ -936,6 +936,8 @@ version are 1 and 2.
use of grant table v2 without transitive grants is an ABI breakage from the
guests point of view.
+The usage of gnttab v2 is not security supported on ARM platforms.
+
### gnttab\_max\_frames
> `= <integer>`
@@ -1544,6 +1546,30 @@ do; there may be other custom operating systems which do. If you're
certain you don't plan on having PV guests which use this feature,
turning it off can reduce the attack surface.
+### pv-l1tf (x86)
+> `= List of [ <bool>, dom0=<bool>, domu=<bool> ]`
+
+> Default: `false` on believed-unaffected hardware, or in pv-shim mode.
+> `domu` on believed-affected hardware.
+
+Mitigations for L1TF / XSA-273 / CVE-2018-3620 for PV guests.
+
+For backwards compatibility, we may not alter an architecturally-legitimate
+pagetable entry a PV guest chooses to write. We can however force such a
+guest into shadow mode so that Xen controls the PTEs which are reachable by
+the CPU pagewalk.
+
+Shadowing is performed at the point where a PV guest first tries to write an
+L1TF-vulnerable PTE. Therefore, a PV guest kernel which has been updated with
+its own L1TF mitigations will not trigger shadow mode if it is well behaved.
+
+If CONFIG\_SHADOW\_PAGING is not compiled in, this mitigation instead crashes
+the guest when an L1TF-vulnerable PTE is written, which still allows updated,
+well-behaved PV guests to run, despite Shadow being compiled out.
+
+In the pv-shim case, Shadow is expected to be compiled out, and a malicious
+guest kernel can only leak data from the shim Xen, rather than the host Xen.
+
### pv-shim (x86)
> `= <boolean>`
@@ -1748,6 +1774,13 @@ Use `smap=hvm` to allow SMAP use by HVM guests only.
Flag to enable Supervisor Mode Execution Protection
Use `smep=hvm` to allow SMEP use by HVM guests only.
+### smt (x86)
+> `= <boolean>`
+
+Default: `true`
+
+Control bring up of multiple hyper-threads per CPU core.
+
### snb\_igd\_quirk
> `= <boolean> | cap | <integer>`
@@ -1758,7 +1791,8 @@ false disable the quirk workaround, which is also the default.
### spec-ctrl (x86)
> `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb}=<bool>,
-> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu}=<bool> ]`
+> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
+> l1d-flush}=<bool> ]`
Controls for speculative execution sidechannel mitigations. By default, Xen
will pick the most appropriate mitigations based on compiled in support,
@@ -1770,10 +1804,15 @@ extreme care.**
An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
mitigations, including pieces of infrastructure used to virtualise certain
-mitigation features for guests. Alternatively, a slightly more restricted
-`spec-ctrl=no-xen` can be used to turn off all of Xen's mitigations, while
-leaving the virtualisation support in place for guests to use. Use of a
-positive boolean value for either of these options is invalid.
+mitigation features for guests. This also includes settings which `xpti`,
+`smt`, `pv-l1tf` control, unless the respective option(s) have been
+specified earlier on the command line.
+
+Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
+turn off all of Xen's mitigations, while leaving the virtualisation support
+in place for guests to use.
+
+Use of a positive boolean value for either of these options is invalid.
The booleans `pv=`, `hvm=`, `msr-sc=` and `rsb=` offer fine grained control
over the alternative blocks used by Xen. These impact Xen's ability to
@@ -1813,6 +1852,12 @@ from using fully eager FPU context switches. This is currently implemented as
a global control. By default, Xen will choose to use fully eager context
switches on hardware believed to speculate past #NM exceptions.
+On hardware supporting L1D_FLUSH, the `l1d-flush=` option can be used to force
+or prevent Xen from issuing an L1 data cache flush on each VMEntry.
+Irrespective of Xen's setting, the feature is virtualised for HVM guests to
+use. By default, Xen will enable this mitigation on hardware believed to be
+vulnerable to L1TF.
+
### sync\_console
> `= <boolean>`
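
As a reading aid for the `pv-l1tf` defaults documented above, the following is
a hedged sketch of just the default-selection logic; the predicate names are
hypothetical stand-ins, not Xen symbols:

    #include <stdbool.h>

    struct pv_l1tf_default { bool dom0, domu; };

    static struct pv_l1tf_default
    choose_pv_l1tf_default(bool cpu_believed_vulnerable_to_l1tf,
                           bool running_as_pv_shim)
    {
        /* "false" on believed-unaffected hardware, or in pv-shim mode */
        struct pv_l1tf_default def = { .dom0 = false, .domu = false };

        /* "domu" on believed-affected hardware */
        if (cpu_believed_vulnerable_to_l1tf && !running_as_pv_shim)
            def.domu = true;

        return def;
    }

An explicit `pv-l1tf=` setting overrides this default, and as documented above
`spec-ctrl=no` also turns it off unless `pv-l1tf` was given earlier on the
command line.
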
diff --git a/tools/examples/xl.conf b/tools/examples/xl.conf
index 374b6bbc2e..0446deb304 100644
--- a/tools/examples/xl.conf
+++ b/tools/examples/xl.conf
@@ -37,3 +37,8 @@
# (which can take a long time to find out if launching huge guests).
# see xl.conf(5) for details.
#claim_mode=1
+
+# Specify global vcpu hard affinity masks. See xl.conf(5) for details.
+#vm.cpumask="0-7"
+#vm.pv.cpumask="0-3"
+#vm.hvm.cpumask="3-7"
diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c
index 7b0f594c3d..52e16c20ed 100644
--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -204,6 +204,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str)
{"avx512-4fmaps",0x00000007, 0, CPUID_REG_EDX, 3, 1},
{"ibrsb", 0x00000007, 0, CPUID_REG_EDX, 26, 1},
{"stibp", 0x00000007, 0, CPUID_REG_EDX, 27, 1},
+ {"l1d-flush", 0x00000007, 0, CPUID_REG_EDX, 28, 1},
{"arch-caps", 0x00000007, 0, CPUID_REG_EDX, 29, 1},
{"ssbd", 0x00000007, 0, CPUID_REG_EDX, 31, 1},
diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
index e116339733..3888b4e158 100644
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -143,7 +143,7 @@ static const char *str_7d0[32] =
[ 2] = "avx512_4vnniw", [ 3] = "avx512_4fmaps",
[26] = "ibrsb", [27] = "stibp",
- /* 28 */ [29] = "arch_caps",
+ [28] = "l1d_flush", [29] = "arch_caps",
/* 30 */ [31] = "ssbd",
};
diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml
index 13cf3b5bf4..5a8c377603 100644
--- a/tools/ocaml/xenstored/store.ml
+++ b/tools/ocaml/xenstored/store.ml
@@ -262,7 +262,8 @@ let path_write store perm path value =
Node.check_perm store.root perm Perms.WRITE;
Node.set_value store.root value, false
) else
- Path.apply_modify store.root path do_write, !node_created
+ let root = Path.apply_modify store.root path do_write in
+ root, !node_created
let path_rm store perm path =
let do_rm node name =
diff --git a/tools/xl/xl.c b/tools/xl/xl.c
index 179908b4f6..7d2142f16f 100644
--- a/tools/xl/xl.c
+++ b/tools/xl/xl.c
@@ -28,6 +28,9 @@
#include <libxl_utils.h>
#include <libxlutil.h>
#include "xl.h"
+#include "xl_parse.h"
+
+#include "xl_utils.h"
xentoollog_logger_stdiostream *logger;
int dryrun_only;
@@ -42,6 +45,9 @@ char *default_gatewaydev = NULL;
char *default_vifbackend = NULL;
char *default_remus_netbufscript = NULL;
char *default_colo_proxy_script = NULL;
+libxl_bitmap global_vm_affinity_mask;
+libxl_bitmap global_hvm_affinity_mask;
+libxl_bitmap global_pv_affinity_mask;
enum output_format default_output_format = OUTPUT_FORMAT_JSON;
int claim_mode = 1;
bool progress_use_cr = 0;
@@ -203,6 +209,26 @@ static void parse_global_config(const char *configfile,
if (!xlu_cfg_get_long (config, "max_maptrack_frames", &l, 0))
max_maptrack_frames = l;
+ libxl_bitmap_init(&global_vm_affinity_mask);
+ libxl_cpu_bitmap_alloc(ctx, &global_vm_affinity_mask, 0);
+ libxl_bitmap_init(&global_hvm_affinity_mask);
+ libxl_cpu_bitmap_alloc(ctx, &global_hvm_affinity_mask, 0);
+ libxl_bitmap_init(&global_pv_affinity_mask);
+ libxl_cpu_bitmap_alloc(ctx, &global_pv_affinity_mask, 0);
+
+ if (!xlu_cfg_get_string (config, "vm.cpumask", &buf, 0))
+ parse_cpurange(buf, &global_vm_affinity_mask);
+ else
+ libxl_bitmap_set_any(&global_vm_affinity_mask);
+ if (!xlu_cfg_get_string (config, "vm.hvm.cpumask", &buf, 0))
+ parse_cpurange(buf, &global_hvm_affinity_mask);
+ else
+ libxl_bitmap_set_any(&global_hvm_affinity_mask);
+ if (!xlu_cfg_get_string (config, "vm.pv.cpumask", &buf, 0))
+ parse_cpurange(buf, &global_pv_affinity_mask);
+ else
+ libxl_bitmap_set_any(&global_pv_affinity_mask);
+
xlu_cfg_destroy(config);
}
diff --git a/tools/xl/xl.h b/tools/xl/xl.h
index 4e784ff402..7e97144b50 100644
--- a/tools/xl/xl.h
+++ b/tools/xl/xl.h
@@ -41,6 +41,7 @@ struct domain_create {
int vncautopass;
int console_autoconnect;
int checkpointed_stream;
+ int ignore_global_affinity_masks;
const char *config_file;
char *extra_config; /* extra config string */
const char *restore_file;
@@ -279,6 +280,9 @@ extern char *default_colo_proxy_script;
extern char *blkdev_start;
extern int max_grant_frames;
extern int max_maptrack_frames;
+extern libxl_bitmap global_vm_affinity_mask;
+extern libxl_bitmap global_hvm_affinity_mask;
+extern libxl_bitmap global_pv_affinity_mask;
enum output_format {
OUTPUT_FORMAT_JSON,
@@ -294,6 +298,9 @@ typedef enum {
} domain_restart_type;
extern void printf_info_sexp(int domid, libxl_domain_config *d_config, FILE *fh);
+extern void apply_global_affinity_masks(libxl_domain_type type,
+ libxl_bitmap *vcpu_affinity_array,
+ unsigned int size);
#define XL_GLOBAL_CONFIG XEN_CONFIG_DIR "/xl.conf"
#define XL_LOCK_FILE XEN_LOCK_DIR "/xl"
diff --git a/tools/xl/xl_cmdtable.c b/tools/xl/xl_cmdtable.c
index bf2ced8140..54c2db6022 100644
--- a/tools/xl/xl_cmdtable.c
+++ b/tools/xl/xl_cmdtable.c
@@ -34,7 +34,8 @@ struct cmd_spec cmd_table[] = {
"-e Do not wait in the background for the death of the domain.\n"
"-V, --vncviewer Connect to the VNC display after the domain is created.\n"
"-A, --vncviewer-autopass\n"
- " Pass VNC password to viewer via stdin."
+ " Pass VNC password to viewer via stdin.\n"
+ "--ignore-global-affinity-masks Ignore global masks in xl.conf."
},
{ "config-update",
&main_config_update, 1, 1,
@@ -224,7 +225,8 @@ struct cmd_spec cmd_table[] = {
&main_vcpupin, 1, 1,
"Set which CPUs a VCPU can use",
"[option] <Domain> <VCPU|all> <Hard affinity|-|all> <Soft affinity|-|all>",
- "-f, --force undo an override pinning done by the kernel",
+ "-f, --force undo an override pinning done by the kernel\n"
+ "--ignore-global-affinity-masks Ignore global masks in xl.conf",
},
{ "vcpu-set",
&main_vcpuset, 0, 1,
diff --git a/tools/xl/xl_vcpu.c b/tools/xl/xl_vcpu.c
index 8e735b38c1..3384eeed06 100644
--- a/tools/xl/xl_vcpu.c
+++ b/tools/xl/xl_vcpu.c
@@ -68,6 +68,61 @@ static void print_domain_vcpuinfo(uint32_t domid, uint32_t nr_cpus)
libxl_vcpuinfo_list_free(vcpuinfo, nb_vcpu);
}
+void apply_global_affinity_masks(libxl_domain_type type,
+ libxl_bitmap *vcpu_affinity_array,
+ unsigned int size)
+{
+ libxl_bitmap *mask = &global_vm_affinity_mask;
+ libxl_bitmap *type_mask;
+ unsigned int i;
+
+ switch (type) {
+ case LIBXL_DOMAIN_TYPE_HVM:
+ case LIBXL_DOMAIN_TYPE_PVH:
+ type_mask = &global_hvm_affinity_mask;
+ break;
+ case LIBXL_DOMAIN_TYPE_PV:
+ type_mask = &global_pv_affinity_mask;
+ break;
+ default:
+ fprintf(stderr, "Unknown guest type\n");
+ exit(EXIT_FAILURE);
+ }
+
+ for (i = 0; i < size; i++) {
+ int rc;
+ libxl_bitmap *t = &vcpu_affinity_array[i];
+ libxl_bitmap b1, b2;
+
+ libxl_bitmap_init(&b1);
+ libxl_bitmap_init(&b2);
+
+ rc = libxl_bitmap_and(ctx, &b1, t, mask);
+ if (rc) {
+ fprintf(stderr, "libxl_bitmap_and errored\n");
+ exit(EXIT_FAILURE);
+ }
+ rc = libxl_bitmap_and(ctx, &b2, &b1, type_mask);
+ if (rc) {
+ fprintf(stderr, "libxl_bitmap_and errored\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (libxl_bitmap_is_empty(&b2)) {
+ fprintf(stderr, "vcpu hard affinity map is empty\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Replace target bitmap with the result */
+ libxl_bitmap_dispose(t);
+ libxl_bitmap_init(t);
+ libxl_bitmap_copy_alloc(ctx, t, &b2);
+
+ libxl_bitmap_dispose(&b1);
+ libxl_bitmap_dispose(&b2);
+ }
+}
+
static void vcpulist(int argc, char **argv)
{
libxl_dominfo *dominfo;
@@ -118,6 +173,7 @@ int main_vcpupin(int argc, char **argv)
{
static struct option opts[] = {
{"force", 0, 0, 'f'},
+ {"ignore-global-affinity-masks", 0, 0, 'i'},
COMMON_LONG_OPTS
};
libxl_vcpuinfo *vcpuinfo;
@@ -132,15 +188,18 @@ int main_vcpupin(int argc, char **argv)
const char *vcpu, *hard_str, *soft_str;
char *endptr;
int opt, nb_cpu, nb_vcpu, rc = EXIT_FAILURE;
- bool force = false;
+ bool force = false, ignore_masks = false;
libxl_bitmap_init(&cpumap_hard);
libxl_bitmap_init(&cpumap_soft);
- SWITCH_FOREACH_OPT(opt, "f", opts, "vcpu-pin", 3) {
+ SWITCH_FOREACH_OPT(opt, "fi", opts, "vcpu-pin", 3) {
case 'f':
force = true;
break;
+ case 'i':
+ ignore_masks = true;
+ break;
default:
break;
}
@@ -222,6 +281,23 @@ int main_vcpupin(int argc, char **argv)
goto out;
}
+ /* Only hard affinity matters here */
+ if (!ignore_masks) {
+ libxl_domain_config d_config;
+
+ libxl_domain_config_init(&d_config);
+ rc = libxl_retrieve_domain_configuration(ctx, domid, &d_config);
+ if (rc) {
+ fprintf(stderr, "Could not retrieve domain configuration\n");
+ libxl_domain_config_dispose(&d_config);
+ goto out;
+ }
+
+ apply_global_affinity_masks(d_config.b_info.type, hard, 1);
+
+ libxl_domain_config_dispose(&d_config);
+ }
+
if (force) {
if (libxl_set_vcpuaffinity_force(ctx, domid, vcpuid, hard, soft)) {
fprintf(stderr, "Could not set affinity for vcpu `%ld'.\n",
diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c
index 89c2b25ded..a1d633795c 100644
--- a/tools/xl/xl_vmcontrol.c
+++ b/tools/xl/xl_vmcontrol.c
@@ -804,6 +804,36 @@ int create_domain(struct domain_create *dom_info)
parse_config_data(config_source, config_data, config_len, &d_config);
}
+ if (!dom_info->ignore_global_affinity_masks) {
+ libxl_domain_build_info *b_info = &d_config.b_info;
+
+ /* It is possible that no hard affinity is specified in config file.
+ * Generate hard affinity maps now if we care about those.
+ */
+ if (b_info->num_vcpu_hard_affinity == 0 &&
+ (!libxl_bitmap_is_full(&global_vm_affinity_mask) ||
+ (b_info->type == LIBXL_DOMAIN_TYPE_PV &&
+ !libxl_bitmap_is_full(&global_pv_affinity_mask)) ||
+ (b_info->type != LIBXL_DOMAIN_TYPE_PV &&
+ !libxl_bitmap_is_full(&global_hvm_affinity_mask))
+ )) {
+ b_info->num_vcpu_hard_affinity = b_info->max_vcpus;
+ b_info->vcpu_hard_affinity =
+ xmalloc(b_info->max_vcpus * sizeof(libxl_bitmap));
+
+ for (i = 0; i < b_info->num_vcpu_hard_affinity; i++) {
+ libxl_bitmap *m = &b_info->vcpu_hard_affinity[i];
+ libxl_bitmap_init(m);
+ libxl_cpu_bitmap_alloc(ctx, m, 0);
+ libxl_bitmap_set_any(m);
+ }
+ }
+
+ apply_global_affinity_masks(b_info->type,
+ b_info->vcpu_hard_affinity,
+ b_info->num_vcpu_hard_affinity);
+ }
+
if (migrate_fd >= 0) {
if (d_config.c_info.name) {
/* when we receive a domain we get its name from the config
@@ -1124,7 +1154,7 @@ int main_create(int argc, char **argv)
const char *filename = NULL;
struct domain_create dom_info;
int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0,
- quiet = 0, monitor = 1, vnc = 0, vncautopass = 0;
+ quiet = 0, monitor = 1, vnc = 0, vncautopass = 0, ignore_masks = 0;
int opt, rc;
static struct option opts[] = {
{"dryrun", 0, 0, 'n'},
@@ -1132,6 +1162,7 @@ int main_create(int argc, char **argv)
{"defconfig", 1, 0, 'f'},
{"vncviewer", 0, 0, 'V'},
{"vncviewer-autopass", 0, 0, 'A'},
+ {"ignore-global-affinity-masks", 0, 0, 'i'},
COMMON_LONG_OPTS
};
@@ -1142,7 +1173,7 @@ int main_create(int argc, char **argv)
argc--; argv++;
}
- SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVA", opts, "create", 0) {
+ SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVAi", opts, "create", 0) {
case 'f':
filename = optarg;
break;
@@ -1174,6 +1205,9 @@ int main_create(int argc, char **argv)
case 'A':
vnc = vncautopass = 1;
break;
+ case 'i':
+ ignore_masks = 1;
+ break;
}
memset(&dom_info, 0, sizeof(dom_info));
@@ -1203,6 +1237,7 @@ int main_create(int argc, char **argv)
dom_info.vnc = vnc;
dom_info.vncautopass = vncautopass;
dom_info.console_autoconnect = console_autoconnect;
+ dom_info.ignore_global_affinity_masks = ignore_masks;
rc = create_domain(&dom_info);
if (rc < 0) {
#diff --git a/xen/Makefile b/xen/Makefile
#index 4d075c381f..a922a1b7b5 100644
#--- a/xen/Makefile
#+++ b/xen/Makefile
#@@ -2,7 +2,7 @@
# # All other places this is stored (eg. compile.h) should be autogenerated.
# export XEN_VERSION = 4
# export XEN_SUBVERSION = 11
#-export XEN_EXTRAVERSION ?= .0$(XEN_VENDORVERSION)
#+export XEN_EXTRAVERSION ?= .1-pre$(XEN_VENDORVERSION)
# export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
# -include xen-version
#
diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index f64fc56739..cfba4a708c 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -72,6 +72,7 @@ config SHADOW_PAGING
* Running HVM guests on hardware lacking hardware paging support
(First-generation Intel VT-x or AMD SVM).
* Live migration of PV guests.
+ * L1TF sidechannel mitigation for PV guests.
Under a small number of specific workloads, shadow paging may be
deliberately used as a performance optimisation.
diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 458a3fe60c..76078b55b2 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -505,17 +505,23 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- c->compute_unit_id = ebx & 0xFF;
c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1;
+
+ if (c->x86 < 0x17)
+ c->compute_unit_id = ebx & 0xFF;
+ else {
+ c->cpu_core_id = ebx & 0xFF;
+ c->x86_max_cores /= c->x86_num_siblings;
+ }
}
if (opt_cpu_info)
printk("CPU %d(%d) -> Processor %d, %s %d\n",
cpu, c->x86_max_cores, c->phys_proc_id,
- cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" :
- "Core",
- cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id :
- c->cpu_core_id);
+ c->compute_unit_id != INVALID_CUID ? "Compute Unit"
+ : "Core",
+ c->compute_unit_id != INVALID_CUID ? c->compute_unit_id
+ : c->cpu_core_id);
}
static void early_init_amd(struct cpuinfo_x86 *c)
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 528aff1811..fdb022875a 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -14,6 +14,7 @@
#include <public/sysctl.h> /* for XEN_INVALID_{SOCKET,CORE}_ID */
#include "cpu.h"
+#include "mcheck/x86_mca.h"
bool_t opt_arat = 1;
boolean_param("arat", opt_arat);
@@ -355,6 +356,9 @@ static void __init early_cpu_detect(void)
hap_paddr_bits = PADDR_BITS;
}
+ if (c->x86_vendor != X86_VENDOR_AMD)
+ park_offline_cpus = opt_mce;
+
initialize_cpu_data(0);
}
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index a8c287d124..32273d9208 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -692,12 +692,15 @@ static void cpu_bank_free(unsigned int cpu)
mcabanks_free(poll);
mcabanks_free(clr);
+
+ per_cpu(poll_bankmask, cpu) = NULL;
+ per_cpu(mce_clear_banks, cpu) = NULL;
}
static int cpu_bank_alloc(unsigned int cpu)
{
- struct mca_banks *poll = mcabanks_alloc();
- struct mca_banks *clr = mcabanks_alloc();
+ struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc();
+ struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc();
if ( !poll || !clr )
{
@@ -725,7 +728,13 @@ static int cpu_callback(
case CPU_UP_CANCELED:
case CPU_DEAD:
- cpu_bank_free(cpu);
+ if ( !park_offline_cpus )
+ cpu_bank_free(cpu);
+ break;
+
+ case CPU_REMOVE:
+ if ( park_offline_cpus )
+ cpu_bank_free(cpu);
break;
}
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
index e5dd956a24..4474a34e34 100644
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -636,8 +636,6 @@ static void clear_cmci(void)
static void cpu_mcheck_disable(void)
{
- clear_in_cr4(X86_CR4_MCE);
-
if ( cmci_support && opt_mce )
clear_cmci();
}
diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
index 207e2e712c..6e27f6ec8e 100644
--- a/xen/arch/x86/cpu/vpmu_intel.c
+++ b/xen/arch/x86/cpu/vpmu_intel.c
@@ -454,13 +454,11 @@ static int core2_vpmu_alloc_resource(struct vcpu *v)
if ( is_hvm_vcpu(v) )
{
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+ if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
goto out_err;
- if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+ if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
goto out_err;
- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
}
core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) +
@@ -535,27 +533,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
uint64_t *enabled_cntrs;
if ( !core2_vpmu_msr_common_check(msr, &type, &index) )
- {
- /* Special handling for BTS */
- if ( msr == MSR_IA32_DEBUGCTLMSR )
- {
- supported |= IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
- IA32_DEBUGCTLMSR_BTINT;
-
- if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
- supported |= IA32_DEBUGCTLMSR_BTS_OFF_OS |
- IA32_DEBUGCTLMSR_BTS_OFF_USR;
- if ( !(msr_content & ~supported) &&
- vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
- return 0;
- if ( (msr_content & supported) &&
- !vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
- printk(XENLOG_G_WARNING
- "%pv: Debug Store unsupported on this CPU\n",
- current);
- }
return -EINVAL;
- }
ASSERT(!supported);
@@ -613,7 +591,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
return -EINVAL;
if ( is_hvm_vcpu(v) )
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
+ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
&core2_vpmu_cxt->global_ctrl);
else
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
@@ -682,7 +660,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
return -EINVAL;
if ( is_hvm_vcpu(v) )
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
+ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
&core2_vpmu_cxt->global_ctrl);
else
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
@@ -701,7 +679,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
else
{
if ( is_hvm_vcpu(v) )
- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
else
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
}
@@ -735,7 +713,7 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
if ( is_hvm_vcpu(v) )
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
else
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, *msr_content);
break;
diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index beee47d0ed..5cc89e2b34 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -43,6 +43,11 @@ static int __init parse_xen_cpuid(const char *s)
if ( !val )
setup_clear_cpu_cap(X86_FEATURE_STIBP);
}
+ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
+ {
+ if ( !val )
+ setup_clear_cpu_cap(X86_FEATURE_L1D_FLUSH);
+ }
else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 )
{
if ( !val )
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 9850a782ec..c39cf2c6e5 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -107,10 +107,11 @@ static void play_dead(void)
local_irq_disable();
/*
- * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
- * as they may be freed at any time. In this case, heap corruption or
- * #PF can occur (when heap debugging is enabled). For example, even
- * printk() can involve tasklet scheduling, which touches per-cpu vars.
+ * NOTE: After cpu_exit_clear, per-cpu variables may no longer be accessible,
+ * as they may be freed at any time if offline CPUs don't get parked. In
+ * this case, heap corruption or #PF can occur (when heap debugging is
+ * enabled). For example, even printk() can involve tasklet scheduling,
+ * which touches per-cpu vars.
*
* Consider very carefully when adding code to *dead_idle. Most hypervisor
* subsystems are unsafe to call.
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 8fbbf3aeb3..dd91038a67 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -225,7 +225,8 @@ static int update_domain_cpuid_info(struct domain *d,
*/
call_policy_changed = (is_hvm_domain(d) &&
((old_7d0 ^ p->feat.raw[0].d) &
- cpufeat_mask(X86_FEATURE_IBRSB)));
+ (cpufeat_mask(X86_FEATURE_IBRSB) |
+ cpufeat_mask(X86_FEATURE_L1D_FLUSH))));
break;
case 0xa:
@@ -1163,7 +1164,7 @@ long arch_do_domctl(
if ( _xcr0_accum )
{
if ( evc->size >= PV_XSAVE_HDR_SIZE + XSTATE_AREA_MIN_SIZE )
- ret = validate_xstate(_xcr0, _xcr0_accum,
+ ret = validate_xstate(d, _xcr0, _xcr0_accum,
&_xsave_area->xsave_hdr);
}
else if ( !_xcr0 )
@@ -1187,8 +1188,7 @@ long arch_do_domctl(
vcpu_pause(v);
v->arch.xcr0 = _xcr0;
v->arch.xcr0_accum = _xcr0_accum;
- if ( _xcr0_accum & XSTATE_NONLAZY )
- v->arch.nonlazy_xstate_used = 1;
+ v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY;
compress_xsave_states(v, _xsave_area,
evc->size - PV_XSAVE_HDR_SIZE);
vcpu_unpause(v);
diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
index 4779b0d0d5..d997806272 100644
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -201,18 +201,21 @@ static int update_clusterinfo(
if ( !cluster_cpus_spare )
cluster_cpus_spare = xzalloc(cpumask_t);
if ( !cluster_cpus_spare ||
- !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
+ !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
err = -ENOMEM;
break;
case CPU_UP_CANCELED:
case CPU_DEAD:
+ case CPU_REMOVE:
+ if ( park_offline_cpus == (action != CPU_REMOVE) )
+ break;
if ( per_cpu(cluster_cpus, cpu) )
{
cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
- xfree(per_cpu(cluster_cpus, cpu));
+ XFREE(per_cpu(cluster_cpus, cpu));
}
- free_cpumask_var(per_cpu(scratch_mask, cpu));
+ FREE_CPUMASK_VAR(per_cpu(scratch_mask, cpu));
break;
}
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index c23983cdff..4cbb688c05 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -907,6 +907,9 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value,
else
p = &host_cpuid_policy;
+ if ( value & ~EFER_KNOWN_MASK )
+ return "Unknown bits set";
+
if ( (value & EFER_SCE) && !p->extd.syscall )
return "SCE without feature";
@@ -1269,7 +1272,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
h->cur += desc->length;
- err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum,
+ err = validate_xstate(d, ctxt->xcr0, ctxt->xcr0_accum,
(const void *)&ctxt->save_area.xsave_hdr);
if ( err )
{
@@ -1324,8 +1327,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
v->arch.xcr0 = ctxt->xcr0;
v->arch.xcr0_accum = ctxt->xcr0_accum;
- if ( ctxt->xcr0_accum & XSTATE_NONLAZY )
- v->arch.nonlazy_xstate_used = 1;
+ v->arch.nonlazy_xstate_used = ctxt->xcr0_accum & XSTATE_NONLAZY;
compress_xsave_states(v, &ctxt->save_area,
size - offsetof(struct hvm_hw_cpu_xsave, save_area));
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 165500e3f2..b964c59dad 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1432,24 +1432,18 @@ static void svm_inject_event(const struct x86_event *event)
* Xen must emulate enough of the event injection to be sure that a
* further fault shouldn't occur during delivery. This covers the fact
* that hardware doesn't perform DPL checking on injection.
- *
- * Also, it accounts for proper positioning of %rip for an event with trap
- * semantics (where %rip should point after the instruction) which suffers
- * a fault during injection (at which point %rip should point at the
- * instruction).
*/
if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION ||
- (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT ||
- event->type == X86_EVENTTYPE_SW_EXCEPTION)) )
+ (!cpu_has_svm_nrips && (event->type >= X86_EVENTTYPE_SW_INTERRUPT)) )
svm_emul_swint_injection(&_event);
- switch ( _event.vector )
+ switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) )
{
case TRAP_debug:
if ( regs->eflags & X86_EFLAGS_TF )
{
__restore_debug_registers(vmcb, curr);
- vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000);
+ vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | DR_STEP);
}
/* fall through */
case TRAP_int3:
@@ -1459,6 +1453,13 @@ static void svm_inject_event(const struct x86_event *event)
domain_pause_for_debugger();
return;
}
+ break;
+
+ case TRAP_page_fault:
+ ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION);
+ curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
+ vmcb_set_cr2(vmcb, _event.cr2);
+ break;
}
if ( unlikely(eventinj.fields.v) &&
@@ -1481,13 +1482,9 @@ static void svm_inject_event(const struct x86_event *event)
* icebp, software events with trap semantics need emulating, so %rip in
* the trap frame points after the instruction.
*
- * The x86 emulator (if requested by the x86_swint_emulate_* choice) will
- * have performed checks such as presence/dpl/etc and believes that the
- * event injection will succeed without faulting.
- *
- * The x86 emulator will always provide fault semantics for software
- * events, with _trap.insn_len set appropriately. If the injection
- * requires emulation, move %rip forwards at this point.
+ * svm_emul_swint_injection() has already confirmed that events with trap
+ * semantics won't fault on injection. Position %rip/NextRIP suitably,
+ * and restrict the event type to what hardware will tolerate.
*/
switch ( _event.type )
{
@@ -1544,16 +1541,12 @@ static void svm_inject_event(const struct x86_event *event)
eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode);
vmcb->eventinj = eventinj;
- if ( _event.vector == TRAP_page_fault )
- {
- curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
- vmcb_set_cr2(vmcb, _event.cr2);
- HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2));
- }
+ if ( _event.vector == TRAP_page_fault &&
+ _event.type == X86_EVENTTYPE_HW_EXCEPTION )
+ HVMTRACE_LONG_2D(PF_INJECT, _event.error_code,
+ TRC_PAR_LONG(_event.cr2));
else
- {
HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
- }
}
static int svm_event_pending(struct vcpu *v)
diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
index aa2f103895..afd552f2b9 100644
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -41,6 +41,15 @@ ENTRY(vmx_asm_vmexit_handler)
SPEC_CTRL_ENTRY_FROM_HVM /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
/* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+ /* Hardware clears MSR_DEBUGCTL on VMExit. Reinstate it if debugging Xen. */
+ .macro restore_lbr
+ mov $IA32_DEBUGCTLMSR_LBR, %eax
+ mov $MSR_IA32_DEBUGCTLMSR, %ecx
+ xor %edx, %edx
+ wrmsr
+ .endm
+ ALTERNATIVE "", restore_lbr, X86_FEATURE_XEN_LBR
+
mov %rsp,%rdi
call vmx_vmexit_handler
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 258fc08f72..2ba0c40808 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -38,6 +38,7 @@
#include <asm/flushtlb.h>
#include <asm/monitor.h>
#include <asm/shadow.h>
+#include <asm/spec_ctrl.h>
#include <asm/tboot.h>
#include <asm/apic.h>
@@ -996,6 +997,7 @@ static int construct_vmcs(struct vcpu *v)
struct domain *d = v->domain;
u32 vmexit_ctl = vmx_vmexit_control;
u32 vmentry_ctl = vmx_vmentry_control;
+ int rc = 0;
vmx_vmcs_enter(v);
@@ -1083,8 +1085,8 @@ static int construct_vmcs(struct vcpu *v)
if ( msr_bitmap == NULL )
{
- vmx_vmcs_exit(v);
- return -ENOMEM;
+ rc = -ENOMEM;
+ goto out;
}
memset(msr_bitmap, ~0, PAGE_SIZE);
@@ -1268,141 +1270,197 @@ static int construct_vmcs(struct vcpu *v)
if ( cpu_has_vmx_tsc_scaling )
__vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio);
- vmx_vmcs_exit(v);
-
/* will update HOST & GUEST_CR3 as reqd */
paging_update_paging_modes(v);
vmx_vlapic_msr_changed(v);
- return 0;
+ if ( opt_l1d_flush && paging_mode_hap(d) )
+ rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
+ VMX_MSR_GUEST_LOADONLY);
+
+ out:
+ vmx_vmcs_exit(v);
+
+ return rc;
}
-static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
+/*
+ * Search an MSR list looking for an MSR entry, or the slot in which it should
+ * live (to keep the data sorted) if an entry is not found.
+ *
+ * The return pointer is guaranteed to be bounded by start and end. However,
+ * it may point at end, and may be invalid for the caller to dereference.
+ */
+static struct vmx_msr_entry *locate_msr_entry(
+ struct vmx_msr_entry *start, struct vmx_msr_entry *end, uint32_t msr)
{
- const u32 *msr = key;
- const struct vmx_msr_entry *entry = elt;
+ while ( start < end )
+ {
+ struct vmx_msr_entry *mid = start + (end - start) / 2;
- if ( *msr > entry->index )
- return 1;
- if ( *msr < entry->index )
- return -1;
+ if ( msr < mid->index )
+ end = mid;
+ else if ( msr > mid->index )
+ start = mid + 1;
+ else
+ return mid;
+ }
- return 0;
+ return start;
}
-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
+struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+ enum vmx_msr_list_type type)
{
- struct vcpu *curr = current;
- unsigned int msr_count;
- struct vmx_msr_entry *msr_area;
+ const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
+ struct vmx_msr_entry *start = NULL, *ent, *end;
+ unsigned int substart = 0, subend = vmx->msr_save_count;
+ unsigned int total = vmx->msr_load_count;
- if ( type == VMX_GUEST_MSR )
- {
- msr_count = curr->arch.hvm_vmx.msr_count;
- msr_area = curr->arch.hvm_vmx.msr_area;
- }
- else
+ ASSERT(v == current || !vcpu_runnable(v));
+
+ switch ( type )
{
- ASSERT(type == VMX_HOST_MSR);
- msr_count = curr->arch.hvm_vmx.host_msr_count;
- msr_area = curr->arch.hvm_vmx.host_msr_area;
+ case VMX_MSR_HOST:
+ start = vmx->host_msr_area;
+ subend = vmx->host_msr_count;
+ total = subend;
+ break;
+
+ case VMX_MSR_GUEST:
+ start = vmx->msr_area;
+ break;
+
+ case VMX_MSR_GUEST_LOADONLY:
+ start = vmx->msr_area;
+ substart = subend;
+ subend = total;
+ break;
+
+ default:
+ ASSERT_UNREACHABLE();
}
- if ( msr_area == NULL )
+ if ( !start )
return NULL;
- return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry),
- vmx_msr_entry_key_cmp);
+ end = start + total;
+ ent = locate_msr_entry(start + substart, start + subend, msr);
+
+ return ((ent < end) && (ent->index == msr)) ? ent : NULL;
}
-int vmx_read_guest_msr(u32 msr, u64 *val)
+int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+ enum vmx_msr_list_type type)
{
- struct vmx_msr_entry *ent;
+ struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
+ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
+ unsigned int substart, subend, total;
+ int rc;
- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
+ ASSERT(v == current || !vcpu_runnable(v));
+
+ switch ( type )
{
- *val = ent->data;
- return 0;
- }
+ case VMX_MSR_HOST:
+ ptr = &vmx->host_msr_area;
+ substart = 0;
+ subend = vmx->host_msr_count;
+ total = subend;
+ break;
- return -ESRCH;
-}
+ case VMX_MSR_GUEST:
+ ptr = &vmx->msr_area;
+ substart = 0;
+ subend = vmx->msr_save_count;
+ total = vmx->msr_load_count;
+ break;
-int vmx_write_guest_msr(u32 msr, u64 val)
-{
- struct vmx_msr_entry *ent;
+ case VMX_MSR_GUEST_LOADONLY:
+ ptr = &vmx->msr_area;
+ substart = vmx->msr_save_count;
+ subend = vmx->msr_load_count;
+ total = subend;
+ break;
- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
- {
- ent->data = val;
- return 0;
+ default:
+ ASSERT_UNREACHABLE();
+ return -EINVAL;
}
- return -ESRCH;
-}
-
-int vmx_add_msr(u32 msr, int type)
-{
- struct vcpu *curr = current;
- unsigned int idx, *msr_count;
- struct vmx_msr_entry **msr_area, *msr_area_elem;
+ vmx_vmcs_enter(v);
- if ( type == VMX_GUEST_MSR )
- {
- msr_count = &curr->arch.hvm_vmx.msr_count;
- msr_area = &curr->arch.hvm_vmx.msr_area;
- }
- else
+ /* Allocate memory on first use. */
+ if ( unlikely(!*ptr) )
{
- ASSERT(type == VMX_HOST_MSR);
- msr_count = &curr->arch.hvm_vmx.host_msr_count;
- msr_area = &curr->arch.hvm_vmx.host_msr_area;
- }
+ paddr_t addr;
- if ( *msr_area == NULL )
- {
- if ( (*msr_area = alloc_xenheap_page()) == NULL )
- return -ENOMEM;
+ if ( (*ptr = alloc_xenheap_page()) == NULL )
+ {
+ rc = -ENOMEM;
+ goto out;
+ }
- if ( type == VMX_GUEST_MSR )
+ addr = virt_to_maddr(*ptr);
+
+ switch ( type )
{
- __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
- __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
+ case VMX_MSR_HOST:
+ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, addr);
+ break;
+
+ case VMX_MSR_GUEST:
+ case VMX_MSR_GUEST_LOADONLY:
+ __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr);
+ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr);
+ break;
}
- else
- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
}
- for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
- if ( (*msr_area)[idx].index == msr )
- return 0;
+ start = *ptr;
+ end = start + total;
+ ent = locate_msr_entry(start + substart, start + subend, msr);
- if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
- return -ENOSPC;
+ if ( (ent < end) && (ent->index == msr) )
+ goto found;
- memmove(*msr_area + idx + 1, *msr_area + idx,
- sizeof(*msr_area_elem) * (*msr_count - idx));
+ /* If there isn't an existing entry for msr, insert room for one. */
+ if ( total == (PAGE_SIZE / sizeof(*ent)) )
+ {
+ rc = -ENOSPC;
+ goto out;
+ }
- msr_area_elem = *msr_area + idx;
- msr_area_elem->index = msr;
- msr_area_elem->mbz = 0;
+ memmove(ent + 1, ent, sizeof(*ent) * (end - ent));
- ++*msr_count;
+ ent->index = msr;
+ ent->mbz = 0;
- if ( type == VMX_GUEST_MSR )
+ switch ( type )
{
- msr_area_elem->data = 0;
- __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
- }
- else
- {
- rdmsrl(msr, msr_area_elem->data);
- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
+ case VMX_MSR_HOST:
+ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count);
+ break;
+
+ case VMX_MSR_GUEST:
+ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_save_count);
+
+ /* Fallthrough */
+ case VMX_MSR_GUEST_LOADONLY:
+ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, ++vmx->msr_load_count);
+ break;
}
- return 0;
+ /* Set the msr's value. */
+ found:
+ ent->data = val;
+ rc = 0;
+
+ out:
+ vmx_vmcs_exit(v);
+
+ return rc;
}
void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector)
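
The comment on locate_msr_entry() above promises a lower-bound style result:
either the matching entry, or the slot where the MSR would need to be inserted
to keep the list sorted, possibly one past the end. A standalone demonstration
of those semantics, using an illustrative struct rather than Xen's
vmx_msr_entry:

    #include <stdint.h>
    #include <stdio.h>

    struct entry { uint32_t index; uint64_t data; };

    static struct entry *locate(struct entry *start, struct entry *end,
                                uint32_t msr)
    {
        while (start < end) {
            struct entry *mid = start + (end - start) / 2;

            if (msr < mid->index)
                end = mid;
            else if (msr > mid->index)
                start = mid + 1;
            else
                return mid;
        }
        return start;   /* insertion point; may equal the original 'end' */
    }

    int main(void)
    {
        struct entry list[] = { { 0x10b }, { 0x1dd }, { 0x38f } }; /* sorted */

        printf("0x1dd found at slot %td\n",
               locate(list, list + 3, 0x1dd) - list);
        printf("0x1d9 belongs at slot %td\n",
               locate(list, list + 3, 0x1d9) - list);
        return 0;
    }
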
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 610c8d6eb9..b0fababede 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -583,6 +583,12 @@ static void vmx_cpuid_policy_changed(struct vcpu *v)
vmx_clear_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW);
else
vmx_set_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW);
+
+ /* MSR_FLUSH_CMD is safe to pass through if the guest knows about it. */
+ if ( cp->feat.l1d_flush )
+ vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
+ else
+ vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
}
int vmx_guest_x86_mode(struct vcpu *v)
@@ -2758,8 +2764,10 @@ enum
#define LBR_FROM_SIGNEXT_2MSB ((1ULL << 59) | (1ULL << 60))
-#define FIXUP_LBR_TSX (1u << 0)
-#define FIXUP_BDW_ERRATUM_BDF14 (1u << 1)
+#define LBR_MSRS_INSERTED (1u << 0)
+#define LBR_FIXUP_TSX (1u << 1)
+#define LBR_FIXUP_BDF14 (1u << 2)
+#define LBR_FIXUP_MASK (LBR_FIXUP_TSX | LBR_FIXUP_BDF14)
static bool __read_mostly lbr_tsx_fixup_needed;
static bool __read_mostly bdw_erratum_bdf14_fixup_needed;
@@ -2822,7 +2830,7 @@ static int is_last_branch_msr(u32 ecx)
static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
{
- const struct vcpu *curr = current;
+ struct vcpu *curr = current;
HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x", msr);
@@ -2901,7 +2909,7 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
if ( passive_domain_do_rdmsr(msr, msr_content) )
goto done;
- if ( vmx_read_guest_msr(msr, msr_content) == 0 )
+ if ( vmx_read_guest_msr(curr, msr, msr_content) == 0 )
break;
if ( is_last_branch_msr(msr) )
@@ -3036,11 +3044,14 @@ void vmx_vlapic_msr_changed(struct vcpu *v)
static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
{
struct vcpu *v = current;
+ const struct cpuid_policy *cp = v->domain->arch.cpuid;
HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content);
switch ( msr )
{
+ uint64_t rsvd;
+
case MSR_IA32_SYSENTER_CS:
__vmwrite(GUEST_SYSENTER_CS, msr_content);
break;
@@ -3093,45 +3104,85 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
wrmsrl(MSR_SYSCALL_MASK, msr_content);
break;
- case MSR_IA32_DEBUGCTLMSR: {
- int i, rc = 0;
- uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF;
+ case MSR_IA32_DEBUGCTLMSR:
+ rsvd = ~(IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF);
- if ( boot_cpu_has(X86_FEATURE_RTM) )
- supported |= IA32_DEBUGCTLMSR_RTM;
- if ( msr_content & ~supported )
+ /* TODO: Wire vPMU settings properly through the CPUID policy */
+ if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_BTS) )
{
- /* Perhaps some other bits are supported in vpmu. */
- if ( vpmu_do_wrmsr(msr, msr_content, supported) )
- break;
+ rsvd &= ~(IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
+ IA32_DEBUGCTLMSR_BTINT);
+
+ if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
+ rsvd &= ~(IA32_DEBUGCTLMSR_BTS_OFF_OS |
+ IA32_DEBUGCTLMSR_BTS_OFF_USR);
}
- if ( msr_content & IA32_DEBUGCTLMSR_LBR )
+
+ if ( cp->feat.rtm )
+ rsvd &= ~IA32_DEBUGCTLMSR_RTM;
+
+ if ( msr_content & rsvd )
+ goto gp_fault;
+
+ /*
+ * When a guest first enables LBR, arrange to save and restore the LBR
+ * MSRs and allow the guest direct access.
+ *
+ * MSR_DEBUGCTL and LBR has existed almost as long as MSRs have
+ * existed, and there is no architectural way to hide the feature, or
+ * fail the attempt to enable LBR.
+ *
+ * Unknown host LBR MSRs or hitting -ENOSPC with the guest load/save
+ * list are definitely hypervisor bugs, whereas -ENOMEM for allocating
+ * the load/save list is simply unlucky (and shouldn't occur with
+ * sensible management by the toolstack).
+ *
+ * Either way, there is nothing we can do right now to recover, and
+ * the guest won't execute correctly either. Simply crash the domain
+ * to make the failure obvious.
+ */
+ if ( !(v->arch.hvm_vmx.lbr_flags & LBR_MSRS_INSERTED) &&
+ (msr_content & IA32_DEBUGCTLMSR_LBR) )
{
const struct lbr_info *lbr = last_branch_msr_get();
- if ( lbr == NULL )
- break;
- for ( ; (rc == 0) && lbr->count; lbr++ )
- for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
- if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
+ if ( unlikely(!lbr) )
+ {
+ gprintk(XENLOG_ERR, "Unknown Host LBR MSRs\n");
+ domain_crash(v->domain);
+ return X86EMUL_OKAY;
+ }
+
+ for ( ; lbr->count; lbr++ )
+ {
+ unsigned int i;
+
+ for ( i = 0; i < lbr->count; i++ )
+ {
+ int rc = vmx_add_guest_msr(v, lbr->base + i, 0);
+
+ if ( unlikely(rc) )
{
- vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
- if ( lbr_tsx_fixup_needed )
- v->arch.hvm_vmx.lbr_fixup_enabled |= FIXUP_LBR_TSX;
- if ( bdw_erratum_bdf14_fixup_needed )
- v->arch.hvm_vmx.lbr_fixup_enabled |=
- FIXUP_BDW_ERRATUM_BDF14;
+ gprintk(XENLOG_ERR,
+ "Guest load/save list error %d\n", rc);
+ domain_crash(v->domain);
+ return X86EMUL_OKAY;
}
- }
- if ( (rc < 0) ||
- (msr_content && (vmx_add_host_load_msr(msr) < 0)) )
- hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC);
- else
- __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
+ vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
+ }
+ }
+
+ v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED;
+ if ( lbr_tsx_fixup_needed )
+ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_TSX;
+ if ( bdw_erratum_bdf14_fixup_needed )
+ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF14;
+ }
+ __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
break;
- }
+
case MSR_IA32_FEATURE_CONTROL:
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
/* None of these MSRs are writeable. */
@@ -3154,7 +3205,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
if ( wrmsr_viridian_regs(msr, msr_content) )
break;
- if ( vmx_write_guest_msr(msr, msr_content) == 0 ||
+ if ( vmx_write_guest_msr(v, msr, msr_content) == 0 ||
is_last_branch_msr(msr) )
break;
@@ -3701,6 +3752,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
*/
__vmread(EXIT_QUALIFICATION, &exit_qualification);
HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
+ __restore_debug_registers(v);
write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
if ( !v->domain->debugger_attached )
{
@@ -4165,11 +4217,11 @@ out:
static void lbr_tsx_fixup(void)
{
struct vcpu *curr = current;
- unsigned int msr_count = curr->arch.hvm_vmx.msr_count;
+ unsigned int msr_count = curr->arch.hvm_vmx.msr_save_count;
struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
struct vmx_msr_entry *msr;
- if ( (msr = vmx_find_msr(lbr_from_start, VMX_GUEST_MSR)) != NULL )
+ if ( (msr = vmx_find_msr(curr, lbr_from_start, VMX_MSR_GUEST)) != NULL )
{
/*
* Sign extend into bits 61:62 while preserving bit 63
@@ -4179,15 +4231,15 @@ static void lbr_tsx_fixup(void)
msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
}
- if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_GUEST_MSR)) != NULL )
+ if ( (msr = vmx_find_msr(curr, lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
}
-static void sign_extend_msr(u32 msr, int type)
+static void sign_extend_msr(struct vcpu *v, u32 msr, int type)
{
struct vmx_msr_entry *entry;
- if ( (entry = vmx_find_msr(msr, type)) != NULL )
+ if ( (entry = vmx_find_msr(v, msr, type)) != NULL )
{
if ( entry->data & VADDR_TOP_BIT )
entry->data |= CANONICAL_MASK;
@@ -4198,6 +4250,8 @@ static void sign_extend_msr(u32 msr, int type)
static void bdw_erratum_bdf14_fixup(void)
{
+ struct vcpu *curr = current;
+
/*
* Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has
* been observed to have the top three bits corrupted as though the
@@ -4207,17 +4261,17 @@ static void bdw_erratum_bdf14_fixup(void)
* erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by
* sign-extending into bits 48:63.
*/
- sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_GUEST_MSR);
- sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_GUEST_MSR);
+ sign_extend_msr(curr, MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
+ sign_extend_msr(curr, MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
}
static void lbr_fixup(void)
{
struct vcpu *curr = current;
- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_LBR_TSX )
+ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_TSX )
lbr_tsx_fixup();
- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_BDW_ERRATUM_BDF14 )
+ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF14 )
bdw_erratum_bdf14_fixup();
}
@@ -4285,7 +4339,7 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
}
out:
- if ( unlikely(curr->arch.hvm_vmx.lbr_fixup_enabled) )
+ if ( unlikely(curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_MASK) )
lbr_fixup();
HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index bcf46c0743..7d4871b791 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -613,6 +613,9 @@ static int alloc_segdesc_page(struct page_info *page)
return i == 512 ? 0 : -EINVAL;
}
+static int _get_page_type(struct page_info *page, unsigned long type,
+ bool preemptible);
+
static int get_page_and_type_from_mfn(
mfn_t mfn, unsigned long type, struct domain *d,
int partial, int preemptible)
@@ -624,9 +627,7 @@ static int get_page_and_type_from_mfn(
unlikely(!get_page_from_mfn(mfn, d)) )
return -EINVAL;
- rc = (preemptible ?
- get_page_type_preemptible(page, type) :
- (get_page_type(page, type) ? 0 : -EINVAL));
+ rc = _get_page_type(page, type, preemptible);
if ( unlikely(rc) && partial >= 0 &&
(!preemptible || page != current->arch.old_guest_table) )
@@ -1115,7 +1116,7 @@ get_page_from_l2e(
int rc;
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- return 1;
+ return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
{
@@ -1146,7 +1147,7 @@ get_page_from_l3e(
int rc;
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
- return 1;
+ return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
{
@@ -1179,7 +1180,7 @@ get_page_from_l4e(
int rc;
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
- return 1;
+ return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
{
@@ -1389,6 +1390,13 @@ static int alloc_l1_table(struct page_info *page)
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
+ if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
+ {
+ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
+ if ( ret )
+ goto out;
+ }
+
switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
{
default:
@@ -1409,6 +1417,7 @@ static int alloc_l1_table(struct page_info *page)
fail:
gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
+ out:
while ( i-- > 0 )
put_page_from_l1e(pl1e[i], d);
@@ -1456,8 +1465,7 @@ static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
return 1;
}
-static int alloc_l2_table(struct page_info *page, unsigned long type,
- int preemptible)
+static int alloc_l2_table(struct page_info *page, unsigned long type)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = mfn_x(page_to_mfn(page));
@@ -1469,8 +1477,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
{
- if ( preemptible && i > page->nr_validated_ptes
- && hypercall_preempt_check() )
+ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
{
page->nr_validated_ptes = i;
rc = -ERESTART;
@@ -1481,6 +1488,12 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
(rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
continue;
+ if ( unlikely(rc == -ERESTART) )
+ {
+ page->nr_validated_ptes = i;
+ break;
+ }
+
if ( rc < 0 )
{
gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
@@ -1763,7 +1776,7 @@ static void free_l1_table(struct page_info *page)
}
-static int free_l2_table(struct page_info *page, int preemptible)
+static int free_l2_table(struct page_info *page)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = mfn_x(page_to_mfn(page));
@@ -1777,7 +1790,7 @@ static int free_l2_table(struct page_info *page, int preemptible)
do {
if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
put_page_from_l2e(pl2e[i], pfn) == 0 &&
- preemptible && i && hypercall_preempt_check() )
+ i && hypercall_preempt_check() )
{
page->nr_validated_ptes = i;
err = -ERESTART;
@@ -2055,6 +2068,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
rc = -EBUSY;
}
}
+ else if ( pv_l1tf_check_l1e(pt_dom, nl1e) )
+ return -ERESTART;
else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
preserve_ad)) )
{
@@ -2118,6 +2133,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
rc = -EBUSY;
}
}
+ else if ( pv_l1tf_check_l2e(d, nl2e) )
+ return -ERESTART;
else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
preserve_ad)) )
{
@@ -2179,6 +2196,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
rc = -EFAULT;
}
}
+ else if ( pv_l1tf_check_l3e(d, nl3e) )
+ return -ERESTART;
else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
preserve_ad)) )
{
@@ -2244,6 +2263,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
rc = -EFAULT;
}
}
+ else if ( pv_l1tf_check_l4e(d, nl4e) )
+ return -ERESTART;
else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
preserve_ad)) )
{
@@ -2373,7 +2394,8 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
rc = alloc_l1_table(page);
break;
case PGT_l2_page_table:
- rc = alloc_l2_table(page, type, preemptible);
+ ASSERT(preemptible);
+ rc = alloc_l2_table(page, type);
break;
case PGT_l3_page_table:
ASSERT(preemptible);
@@ -2463,7 +2485,8 @@ int free_page_type(struct page_info *page, unsigned long type,
rc = 0;
break;
case PGT_l2_page_table:
- rc = free_l2_table(page, preemptible);
+ ASSERT(preemptible);
+ rc = free_l2_table(page);
break;
case PGT_l3_page_table:
ASSERT(preemptible);
@@ -3550,12 +3573,9 @@ long do_mmuext_op(
}
if ( rc == -ERESTART )
- {
- ASSERT(i < count);
rc = hypercall_create_continuation(
__HYPERVISOR_mmuext_op, "hihi",
uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
- }
else if ( curr->arch.old_guest_table )
{
XEN_GUEST_HANDLE_PARAM(void) null;
@@ -3861,12 +3881,9 @@ long do_mmu_update(
}
if ( rc == -ERESTART )
- {
- ASSERT(i < count);
rc = hypercall_create_continuation(
__HYPERVISOR_mmu_update, "hihi",
ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
- }
else if ( curr->arch.old_guest_table )
{
XEN_GUEST_HANDLE_PARAM(void) null;
@@ -4121,7 +4138,13 @@ static int __do_update_va_mapping(
long do_update_va_mapping(unsigned long va, u64 val64,
unsigned long flags)
{
- return __do_update_va_mapping(va, val64, flags, current->domain);
+ int rc = __do_update_va_mapping(va, val64, flags, current->domain);
+
+ if ( rc == -ERESTART )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_update_va_mapping, "lll", va, val64, flags);
+
+ return rc;
}
long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
@@ -4138,6 +4161,46 @@ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
put_pg_owner(pg_owner);
+ if ( rc == -ERESTART )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_update_va_mapping_otherdomain,
+ "llli", va, val64, flags, domid);
+
+ return rc;
+}
+
+int compat_update_va_mapping(unsigned int va, uint32_t lo, uint32_t hi,
+ unsigned int flags)
+{
+ int rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo,
+ flags, current->domain);
+
+ if ( rc == -ERESTART )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_update_va_mapping, "iiii", va, lo, hi, flags);
+
+ return rc;
+}
+
+int compat_update_va_mapping_otherdomain(unsigned int va,
+ uint32_t lo, uint32_t hi,
+ unsigned int flags, domid_t domid)
+{
+ struct domain *pg_owner;
+ int rc;
+
+ if ( (pg_owner = get_pg_owner(domid)) == NULL )
+ return -ESRCH;
+
+ rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, pg_owner);
+
+ put_pg_owner(pg_owner);
+
+ if ( rc == -ERESTART )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_update_va_mapping_otherdomain,
+ "iiiii", va, lo, hi, flags, domid);
+
return rc;
}
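
The pv_l1tf_check_l[1-4]e() calls added above decide whether a not-present PTE
written by a PV guest is dangerous under L1TF and therefore warrants forcing
the domain into shadow mode. A conceptual, heavily simplified sketch of that
test, with illustrative constants rather than Xen's real masks:

    #include <stdbool.h>
    #include <stdint.h>

    #define PTE_PRESENT   (1ULL << 0)
    #define PTE_ADDR_MASK 0x000ffffffffff000ULL   /* illustrative */

    /*
     * Under L1TF the address bits of a not-present PTE can still feed a
     * speculative L1D lookup, so a PTE is only "safe" if it is present
     * (normal protections apply) or its address bits point above the range
     * of cacheable host memory.
     */
    static bool pte_needs_l1tf_mitigation(uint64_t pte,
                                          uint64_t max_cacheable_paddr)
    {
        if (pte & PTE_PRESENT)
            return false;

        return (pte & PTE_ADDR_MASK) < max_cacheable_paddr;
    }
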
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index 2b0445ffe9..dcee496eb0 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -873,6 +873,8 @@ void paging_dump_domain_info(struct domain *d)
printk(" paging assistance: ");
if ( paging_mode_shadow(d) )
printk("shadow ");
+ if ( paging_mode_sh_forced(d) )
+ printk("forced ");
if ( paging_mode_hap(d) )
printk("hap ");
if ( paging_mode_refcounts(d) )
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index dd61b50eb7..fd42d734e7 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -3177,6 +3177,15 @@ static void sh_new_mode(struct domain *d, u32 new_mode)
ASSERT(paging_locked_by_me(d));
ASSERT(d != current->domain);
+ /*
+ * If PG_SH_forced has previously been activated because of writing an
+ * L1TF-vulnerable PTE, it must remain active for the remaining lifetime
+ * of the domain, even if the logdirty mode needs to be controlled for
+ * migration purposes.
+ */
+ if ( paging_mode_sh_forced(d) )
+ new_mode |= PG_SH_forced | PG_SH_enable;
+
d->arch.paging.mode = new_mode;
for_each_vcpu(d, v)
sh_update_paging_modes(v);
@@ -4057,6 +4066,33 @@ void shadow_audit_tables(struct vcpu *v)
#endif /* Shadow audit */
+#ifdef CONFIG_PV
+
+void pv_l1tf_tasklet(unsigned long data)
+{
+ struct domain *d = (void *)data;
+
+ domain_pause(d);
+ paging_lock(d);
+
+ if ( !paging_mode_sh_forced(d) && !d->is_dying )
+ {
+ int ret = shadow_one_bit_enable(d, PG_SH_forced);
+
+ if ( ret )
+ {
+ printk(XENLOG_G_ERR "d%d Failed to enable PG_SH_forced: %d\n",
+ d->domain_id, ret);
+ domain_crash(d);
+ }
+ }
+
+ paging_unlock(d);
+ domain_unpause(d);
+}
+
+#endif /* CONFIG_PV */
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
index 49140e46f0..f3f6d48668 100644
--- a/xen/arch/x86/mpparse.c
+++ b/xen/arch/x86/mpparse.c
@@ -68,19 +68,26 @@ physid_mask_t phys_cpu_present_map;
void __init set_nr_cpu_ids(unsigned int max_cpus)
{
+ unsigned int tot_cpus = num_processors + disabled_cpus;
+
if (!max_cpus)
- max_cpus = num_processors + disabled_cpus;
+ max_cpus = tot_cpus;
if (max_cpus > NR_CPUS)
max_cpus = NR_CPUS;
else if (!max_cpus)
max_cpus = 1;
printk(XENLOG_INFO "SMP: Allowing %u CPUs (%d hotplug CPUs)\n",
max_cpus, max_t(int, max_cpus - num_processors, 0));
- nr_cpu_ids = max_cpus;
+
+ if (!park_offline_cpus)
+ tot_cpus = max_cpus;
+ nr_cpu_ids = min(tot_cpus, NR_CPUS + 0u);
+ if (park_offline_cpus && nr_cpu_ids < num_processors)
+ printk(XENLOG_WARNING "SMP: Cannot bring up %u further CPUs\n",
+ num_processors - nr_cpu_ids);
#ifndef nr_cpumask_bits
- nr_cpumask_bits = (max_cpus + (BITS_PER_LONG - 1)) &
- ~(BITS_PER_LONG - 1);
+ nr_cpumask_bits = ROUNDUP(nr_cpu_ids, BITS_PER_LONG);
printk(XENLOG_DEBUG "NR_CPUS:%u nr_cpumask_bits:%u\n",
NR_CPUS, nr_cpumask_bits);
#endif
diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
index 1e12ccb729..1a591dd2b5 100644
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -150,6 +150,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
case MSR_AMD_PATCHLOADER:
case MSR_IA32_UCODE_WRITE:
case MSR_PRED_CMD:
+ case MSR_FLUSH_CMD:
/* Write-only */
goto gp_fault;
@@ -254,6 +255,17 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
wrmsrl(MSR_PRED_CMD, val);
break;
+ case MSR_FLUSH_CMD:
+ if ( !cp->feat.l1d_flush )
+ goto gp_fault; /* MSR available? */
+
+ if ( val & ~FLUSH_CMD_L1D )
+ goto gp_fault; /* Rsvd bit set? */
+
+ if ( v == curr )
+ wrmsrl(MSR_FLUSH_CMD, val);
+ break;
+
case MSR_INTEL_MISC_FEATURES_ENABLES:
{
bool old_cpuid_faulting = vp->misc_features_enables.cpuid_faulting;
diff --git a/xen/arch/x86/oprofile/nmi_int.c b/xen/arch/x86/oprofile/nmi_int.c
index d8f5230906..3dfb8fef93 100644
--- a/xen/arch/x86/oprofile/nmi_int.c
+++ b/xen/arch/x86/oprofile/nmi_int.c
@@ -182,7 +182,7 @@ int nmi_reserve_counters(void)
if (!allocate_msrs())
return -ENOMEM;
- /* We walk a thin line between law and rape here.
+ /*
* We need to be careful to install our NMI handler
* without actually triggering any NMIs as this will
* break the core code horrifically.
diff --git a/xen/arch/x86/percpu.c b/xen/arch/x86/percpu.c
index c9997b7937..8be4ebddf4 100644
--- a/xen/arch/x86/percpu.c
+++ b/xen/arch/x86/percpu.c
@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int cpu)
char *p;
if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA )
- return -EBUSY;
+ return 0;
if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL )
return -ENOMEM;
@@ -76,9 +76,12 @@ static int cpu_percpu_callback(
break;
case CPU_UP_CANCELED:
case CPU_DEAD:
- free_percpu_area(cpu);
+ if ( !park_offline_cpus )
+ free_percpu_area(cpu);
break;
- default:
+ case CPU_REMOVE:
+ if ( park_offline_cpus )
+ free_percpu_area(cpu);
break;
}
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index a4f0bd239d..3230ac6a22 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -13,6 +13,7 @@
#include <asm/invpcid.h>
#include <asm/spec_ctrl.h>
#include <asm/pv/domain.h>
+#include <asm/shadow.h>
static __read_mostly enum {
PCID_OFF,
@@ -209,6 +210,8 @@ int pv_vcpu_initialise(struct vcpu *v)
void pv_domain_destroy(struct domain *d)
{
+ pv_l1tf_domain_destroy(d);
+
destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
@@ -229,6 +232,8 @@ int pv_domain_initialise(struct domain *d)
};
int rc = -ENOMEM;
+ pv_l1tf_domain_init(d);
+
d->arch.pv_domain.gdt_ldt_l1tab =
alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
if ( !d->arch.pv_domain.gdt_ldt_l1tab )
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index aa8d5a7556..a3c0c2dd19 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -29,6 +29,7 @@
#include <asm/mm.h>
#include <asm/pci.h>
#include <asm/pv/mm.h>
+#include <asm/shadow.h>
#include "emulate.h"
#include "mm.h"
@@ -129,6 +130,10 @@ static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
/* Check the new PTE. */
nl1e = l1e_from_intpte(val);
+
+ if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) )
+ return X86EMUL_RETRY;
+
switch ( ret = get_page_from_l1e(nl1e, d, d) )
{
default:
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index a3172ca92c..3cd3e81b30 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -62,6 +62,9 @@ boolean_param("nosmp", opt_nosmp);
static unsigned int __initdata max_cpus;
integer_param("maxcpus", max_cpus);
+int8_t __read_mostly opt_smt = -1;
+boolean_param("smt", opt_smt);
+
/* opt_invpcid: If false, don't use INVPCID instruction even if available. */
static bool __initdata opt_invpcid = true;
boolean_param("invpcid", opt_invpcid);
@@ -665,7 +668,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
{
char *memmap_type = NULL;
char *cmdline, *kextra, *loader;
- unsigned int initrdidx;
+ unsigned int initrdidx, num_parked = 0;
multiboot_info_t *mbi;
module_t *mod;
unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
@@ -909,6 +912,18 @@ void __init noreturn __start_xen(unsigned long mbi_p)
/* Sanitise the raw E820 map to produce a final clean version. */
max_page = raw_max_page = init_e820(memmap_type, &e820_raw);
+ if ( !efi_enabled(EFI_BOOT) )
+ {
+ /*
+ * Supplement the heuristics in l1tf_calculations() by assuming that
+ * anything referenced in the E820 may be cacheable.
+ */
+ l1tf_safe_maddr =
+ max(l1tf_safe_maddr,
+ ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
+ e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
+ }
+
/* Create a temporary copy of the E820 map. */
memcpy(&boot_e820, &e820, sizeof(e820));
@@ -1494,7 +1509,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
else
{
set_nr_cpu_ids(max_cpus);
- max_cpus = nr_cpu_ids;
+ if ( !max_cpus )
+ max_cpus = nr_cpu_ids;
}
if ( xen_guest )
@@ -1617,16 +1633,30 @@ void __init noreturn __start_xen(unsigned long mbi_p)
/* Set up node_to_cpumask based on cpu_to_node[]. */
numa_add_cpu(i);
- if ( (num_online_cpus() < max_cpus) && !cpu_online(i) )
+ if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
+ !cpu_online(i) )
{
int ret = cpu_up(i);
if ( ret != 0 )
printk("Failed to bring up CPU %u (error %d)\n", i, ret);
+ else if ( num_online_cpus() > max_cpus ||
+ (!opt_smt &&
+ cpu_data[i].compute_unit_id == INVALID_CUID &&
+ cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) )
+ {
+ ret = cpu_down(i);
+ if ( !ret )
+ ++num_parked;
+ else
+ printk("Could not re-offline CPU%u (%d)\n", i, ret);
+ }
}
}
}
printk("Brought up %ld CPUs\n", (long)num_online_cpus());
+ if ( num_parked )
+ printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
smp_cpus_done();
do_initcalls();
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index d4478e6132..7e76cc3d68 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
cpumask_t cpu_online_map __read_mostly;
EXPORT_SYMBOL(cpu_online_map);
+bool __read_mostly park_offline_cpus;
+
unsigned int __read_mostly nr_sockets;
cpumask_t **__read_mostly socket_cpumask;
static cpumask_t *secondary_socket_cpumask;
@@ -234,33 +236,41 @@ static void link_thread_siblings(int cpu1, int cpu2)
cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
}
-static void set_cpu_sibling_map(int cpu)
+static void set_cpu_sibling_map(unsigned int cpu)
{
- int i;
+ unsigned int i;
struct cpuinfo_x86 *c = cpu_data;
cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
+ cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu));
+ cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
if ( c[cpu].x86_num_siblings > 1 )
{
for_each_cpu ( i, &cpu_sibling_setup_map )
{
- if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) {
- if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
- (c[cpu].compute_unit_id == c[i].compute_unit_id) )
+ if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id )
+ continue;
+ if ( c[cpu].compute_unit_id != INVALID_CUID &&
+ c[i].compute_unit_id != INVALID_CUID )
+ {
+ if ( c[cpu].compute_unit_id == c[i].compute_unit_id )
link_thread_siblings(cpu, i);
- } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
- (c[cpu].cpu_core_id == c[i].cpu_core_id) ) {
- link_thread_siblings(cpu, i);
}
+ else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID &&
+ c[i].cpu_core_id != XEN_INVALID_CORE_ID )
+ {
+ if ( c[cpu].cpu_core_id == c[i].cpu_core_id )
+ link_thread_siblings(cpu, i);
+ }
+ else
+ printk(XENLOG_WARNING
+ "CPU%u: unclear relationship with CPU%u\n",
+ cpu, i);
}
}
- else
- {
- cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
- }
if ( c[cpu].x86_max_cores == 1 )
{
@@ -887,7 +897,14 @@ static void cleanup_cpu_root_pgt(unsigned int cpu)
}
}
-static void cpu_smpboot_free(unsigned int cpu)
+/*
+ * The 'remove' boolean controls whether a CPU is just getting offlined (and
+ * parked), or outright removed / offlined without parking. Parked CPUs need
+ * things like their stack, GDT, IDT, TSS, and per-CPU data still available.
+ * A few other items, in particular CPU masks, are also retained, as it's
+ * difficult to prove that they're entirely unreferenced from parked CPUs.
+ */
+static void cpu_smpboot_free(unsigned int cpu, bool remove)
{
unsigned int order, socket = cpu_to_socket(cpu);
struct cpuinfo_x86 *c = cpu_data;
@@ -898,15 +915,19 @@ static void cpu_smpboot_free(unsigned int cpu)
socket_cpumask[socket] = NULL;
}
- c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
- c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
- c[cpu].compute_unit_id = INVALID_CUID;
cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
- free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
- free_cpumask_var(per_cpu(cpu_core_mask, cpu));
- if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
- free_cpumask_var(per_cpu(scratch_cpumask, cpu));
+ if ( remove )
+ {
+ c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
+ c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
+ c[cpu].compute_unit_id = INVALID_CUID;
+
+ FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu));
+ FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
+ if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
+ FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
+ }
cleanup_cpu_root_pgt(cpu);
@@ -928,19 +949,21 @@ static void cpu_smpboot_free(unsigned int cpu)
}
order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
- free_xenheap_pages(per_cpu(gdt_table, cpu), order);
+ if ( remove )
+ FREE_XENHEAP_PAGES(per_cpu(gdt_table, cpu), order);
free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);
- order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
- free_xenheap_pages(idt_tables[cpu], order);
- idt_tables[cpu] = NULL;
-
- if ( stack_base[cpu] != NULL )
+ if ( remove )
{
- memguard_unguard_stack(stack_base[cpu]);
- free_xenheap_pages(stack_base[cpu], STACK_ORDER);
- stack_base[cpu] = NULL;
+ order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
+ FREE_XENHEAP_PAGES(idt_tables[cpu], order);
+
+ if ( stack_base[cpu] )
+ {
+ memguard_unguard_stack(stack_base[cpu]);
+ FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER);
+ }
}
}
@@ -955,15 +978,17 @@ static int cpu_smpboot_alloc(unsigned int cpu)
if ( node != NUMA_NO_NODE )
memflags = MEMF_node(node);
- stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
+ if ( stack_base[cpu] == NULL )
+ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
if ( stack_base[cpu] == NULL )
goto out;
memguard_guard_stack(stack_base[cpu]);
order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
- per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
+ gdt = per_cpu(gdt_table, cpu) ?: alloc_xenheap_pages(order, memflags);
if ( gdt == NULL )
goto out;
+ per_cpu(gdt_table, cpu) = gdt;
memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
BUILD_BUG_ON(NR_CPUS > 0x10000);
gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
@@ -975,7 +1000,8 @@ static int cpu_smpboot_alloc(unsigned int cpu)
gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
- idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
+ if ( idt_tables[cpu] == NULL )
+ idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
if ( idt_tables[cpu] == NULL )
goto out;
memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
@@ -1003,16 +1029,16 @@ static int cpu_smpboot_alloc(unsigned int cpu)
(secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
goto out;
- if ( !(zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
- zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
- alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
+ if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
+ cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
+ cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
goto out;
rc = 0;
out:
if ( rc )
- cpu_smpboot_free(cpu);
+ cpu_smpboot_free(cpu, true);
return rc;
}
@@ -1030,9 +1056,10 @@ static int cpu_smpboot_callback(
break;
case CPU_UP_CANCELED:
case CPU_DEAD:
- cpu_smpboot_free(cpu);
+ cpu_smpboot_free(cpu, !park_offline_cpus);
break;
- default:
+ case CPU_REMOVE:
+ cpu_smpboot_free(cpu, true);
break;
}
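/*
 * Illustrative sketch (not part of the patch): cpu_smpboot_free(cpu, false)
 * keeps a parked CPU's stack, GDT, IDT and CPU masks so that a later
 * re-onlining can reuse them.  The FREE_XENHEAP_PAGES() / FREE_CPUMASK_VAR()
 * helpers used above are assumed to free the object and then NULL the
 * variable, which is what lets the allocation path test "already present?"
 * and skip the allocation.  A hypothetical stand-in for that pattern:
 */
#define FREE_AND_NULL(p, free_fn) do { free_fn(p); (p) = NULL; } while ( 0 )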
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 08e6784c4c..f0c50d6703 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -19,10 +19,13 @@
#include <xen/errno.h>
#include <xen/init.h>
#include <xen/lib.h>
+#include <xen/warning.h>
#include <asm/microcode.h>
#include <asm/msr.h>
#include <asm/processor.h>
+#include <asm/pv/shim.h>
+#include <asm/setup.h>
#include <asm/spec_ctrl.h>
#include <asm/spec_ctrl_asm.h>
@@ -45,11 +48,16 @@ static int8_t __initdata opt_ibrs = -1;
bool __read_mostly opt_ibpb = true;
bool __read_mostly opt_ssbd = false;
int8_t __read_mostly opt_eager_fpu = -1;
+int8_t __read_mostly opt_l1d_flush = -1;
bool __initdata bsp_delay_spec_ctrl;
uint8_t __read_mostly default_xen_spec_ctrl;
uint8_t __read_mostly default_spec_ctrl_flags;
+paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr;
+static bool __initdata cpu_has_bug_l1tf;
+static unsigned int __initdata l1d_maxphysaddr;
+
static int __init parse_bti(const char *s)
{
const char *ss;
@@ -124,6 +132,17 @@ static int __init parse_spec_ctrl(const char *s)
opt_msr_sc_pv = false;
opt_msr_sc_hvm = false;
+ opt_eager_fpu = 0;
+
+ if ( opt_xpti < 0 )
+ opt_xpti = 0;
+
+ if ( opt_smt < 0 )
+ opt_smt = 1;
+
+ if ( opt_pv_l1tf < 0 )
+ opt_pv_l1tf = 0;
+
disable_common:
opt_rsb_pv = false;
opt_rsb_hvm = false;
@@ -131,7 +150,8 @@ static int __init parse_spec_ctrl(const char *s)
opt_thunk = THUNK_JMP;
opt_ibrs = 0;
opt_ibpb = false;
- opt_eager_fpu = 0;
+ opt_ssbd = false;
+ opt_l1d_flush = 0;
}
else if ( val > 0 )
rc = -EINVAL;
@@ -187,6 +207,8 @@ static int __init parse_spec_ctrl(const char *s)
opt_ssbd = val;
else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
opt_eager_fpu = val;
+ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
+ opt_l1d_flush = val;
else
rc = -EINVAL;
@@ -197,6 +219,55 @@ static int __init parse_spec_ctrl(const char *s)
}
custom_param("spec-ctrl", parse_spec_ctrl);
+int8_t __read_mostly opt_pv_l1tf = -1;
+
+static __init int parse_pv_l1tf(const char *s)
+{
+ const char *ss;
+ int val, rc = 0;
+
+ /* Inhibit the defaults as an explicit choice has been given. */
+ if ( opt_pv_l1tf == -1 )
+ opt_pv_l1tf = 0;
+
+ /* Interpret 'pv-l1tf' alone in its positive boolean form. */
+ if ( *s == '\0' )
+ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
+
+ do {
+ ss = strchr(s, ',');
+ if ( !ss )
+ ss = strchr(s, '\0');
+
+ switch ( parse_bool(s, ss) )
+ {
+ case 0:
+ opt_pv_l1tf = 0;
+ break;
+
+ case 1:
+ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
+ break;
+
+ default:
+ if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
+ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOM0) |
+ (val ? OPT_PV_L1TF_DOM0 : 0));
+ else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
+ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) |
+ (val ? OPT_PV_L1TF_DOMU : 0));
+ else
+ rc = -EINVAL;
+ break;
+ }
+
+ s = ss + 1;
+ } while ( *ss );
+
+ return rc;
+}
+custom_param("pv-l1tf", parse_pv_l1tf);
+
static void __init print_details(enum ind_thunk thunk, uint64_t caps)
{
unsigned int _7d0 = 0, e8b = 0, tmp;
@@ -210,22 +281,31 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
printk("Speculative mitigation facilities:\n");
/* Hardware features which pertain to speculative mitigations. */
- printk(" Hardware features:%s%s%s%s%s%s%s%s\n",
+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n",
(_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
(_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
(_7d0 & cpufeat_mask(X86_FEATURE_SSBD)) ? " SSBD" : "",
(e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "",
(caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "",
(caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "",
(caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
+ (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "",
(caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "");
- /* Compiled-in support which pertains to BTI mitigations. */
- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
- printk(" Compiled-in support: INDIRECT_THUNK\n");
+ /* Compiled-in support which pertains to mitigations. */
+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
+ printk(" Compiled-in support:"
+#ifdef CONFIG_INDIRECT_THUNK
+ " INDIRECT_THUNK"
+#endif
+#ifdef CONFIG_SHADOW_PAGING
+ " SHADOW_PAGING"
+#endif
+ "\n");
/* Settings for Xen's protection, irrespective of guests. */
- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s\n",
+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s\n",
thunk == THUNK_NONE ? "N/A" :
thunk == THUNK_RETPOLINE ? "RETPOLINE" :
thunk == THUNK_LFENCE ? "LFENCE" :
@@ -234,7 +314,15 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
(default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
!boot_cpu_has(X86_FEATURE_SSBD) ? "" :
(default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
- opt_ibpb ? " IBPB" : "");
+ opt_ibpb ? " IBPB" : "",
+ opt_l1d_flush ? " L1D_FLUSH" : "");
+
+ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
+ if ( cpu_has_bug_l1tf || opt_pv_l1tf )
+ printk(" L1TF: believed%s vulnerable, maxphysaddr L1D %u, CPUID %u"
+ ", Safe address %"PRIx64"\n",
+ cpu_has_bug_l1tf ? "" : " not",
+ l1d_maxphysaddr, paddr_bits, l1tf_safe_maddr);
/*
* Alternatives blocks for protecting against and/or virtualising
@@ -257,6 +345,10 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s\n",
opt_xpti & OPT_XPTI_DOM0 ? "enabled" : "disabled",
opt_xpti & OPT_XPTI_DOMU ? "enabled" : "disabled");
+
+ printk(" PV L1TF shadowing: Dom0 %s, DomU %s\n",
+ opt_pv_l1tf & OPT_PV_L1TF_DOM0 ? "enabled" : "disabled",
+ opt_pv_l1tf & OPT_PV_L1TF_DOMU ? "enabled" : "disabled");
}
/* Calculate whether Retpoline is known-safe on this CPU. */
@@ -418,20 +510,159 @@ static bool __init should_use_eager_fpu(void)
}
}
-#define OPT_XPTI_DEFAULT 0xff
-uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT;
-
-static __init void xpti_init_default(bool force)
+/* Calculate whether this CPU is vulnerable to L1TF. */
+static __init void l1tf_calculations(uint64_t caps)
{
- uint64_t caps = 0;
+ bool hit_default = false;
+
+ l1d_maxphysaddr = paddr_bits;
+
+ /* L1TF is only known to affect Intel Family 6 processors at this time. */
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6 )
+ {
+ switch ( boot_cpu_data.x86_model )
+ {
+ /*
+ * Core processors since at least Penryn are vulnerable.
+ */
+ case 0x17: /* Penryn */
+ case 0x1d: /* Dunnington */
+ cpu_has_bug_l1tf = true;
+ break;
+
+ case 0x1f: /* Auburndale / Havendale */
+ case 0x1e: /* Nehalem */
+ case 0x1a: /* Nehalem EP */
+ case 0x2e: /* Nehalem EX */
+ case 0x25: /* Westmere */
+ case 0x2c: /* Westmere EP */
+ case 0x2f: /* Westmere EX */
+ cpu_has_bug_l1tf = true;
+ l1d_maxphysaddr = 44;
+ break;
+
+ case 0x2a: /* SandyBridge */
+ case 0x2d: /* SandyBridge EP/EX */
+ case 0x3a: /* IvyBridge */
+ case 0x3e: /* IvyBridge EP/EX */
+ case 0x3c: /* Haswell */
+ case 0x3f: /* Haswell EX/EP */
+ case 0x45: /* Haswell D */
+ case 0x46: /* Haswell H */
+ case 0x3d: /* Broadwell */
+ case 0x47: /* Broadwell H */
+ case 0x4f: /* Broadwell EP/EX */
+ case 0x56: /* Broadwell D */
+ case 0x4e: /* Skylake M */
+ case 0x55: /* Skylake X */
+ case 0x5e: /* Skylake D */
+ case 0x66: /* Cannonlake */
+ case 0x67: /* Cannonlake? */
+ case 0x8e: /* Kabylake M */
+ case 0x9e: /* Kabylake D */
+ cpu_has_bug_l1tf = true;
+ l1d_maxphysaddr = 46;
+ break;
+
+ /*
+ * Atom processors are not vulnerable.
+ */
+ case 0x1c: /* Pineview */
+ case 0x26: /* Lincroft */
+ case 0x27: /* Penwell */
+ case 0x35: /* Cloverview */
+ case 0x36: /* Cedarview */
+ case 0x37: /* Baytrail / Valleyview (Silvermont) */
+ case 0x4d: /* Avoton / Rangeley (Silvermont) */
+ case 0x4c: /* Cherrytrail / Braswell */
+ case 0x4a: /* Merrifield */
+ case 0x5a: /* Moorefield */
+ case 0x5c: /* Goldmont */
+ case 0x5f: /* Denverton */
+ case 0x7a: /* Gemini Lake */
+ break;
+
+ /*
+ * Knights processors are not vulnerable.
+ */
+ case 0x57: /* Knights Landing */
+ case 0x85: /* Knights Mill */
+ break;
+
+ default:
+ /* Defer printk() until we've accounted for RDCL_NO. */
+ hit_default = true;
+ cpu_has_bug_l1tf = true;
+ break;
+ }
+ }
+
+ /* Any processor advertising RDCL_NO should be not vulnerable to L1TF. */
+ if ( caps & ARCH_CAPABILITIES_RDCL_NO )
+ cpu_has_bug_l1tf = false;
+
+ if ( cpu_has_bug_l1tf && hit_default )
+ printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n",
+ boot_cpu_data.x86_model);
+
+ /*
+ * L1TF safe address heuristics. These apply to the real hardware we are
+ * running on, and are best-effort-only if Xen is virtualised.
+ *
+ * The address mask which the L1D cache uses, which might be wider than
+ * the CPUID-reported maxphysaddr.
+ */
+ l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK;
+
+ /*
+ * To be safe, l1tf_safe_maddr must be above the highest cacheable entity
+ * in system physical address space. However, to preserve space for
+ * paged-out metadata, it should be as low as possible above the highest
+ * cacheable address, so as to require fewer high-order bits being set.
+ *
+ * These heuristics are based on some guesswork to improve the likelihood
+ * of safety in the common case, including Linux's L1TF mitigation of
+ * inverting all address bits in a non-present PTE.
+ *
+ * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end
+ * server), setting any address bit beyond CPUID maxphysaddr guarantees
+ * to make the PTE safe. This case doesn't require all the high-order
+ * bits being set, and doesn't require any other source of information
+ * for safety.
+ *
+ * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we
+ * must sacrifice high order bits from the real address space for
+ * safety. Therefore, make a blind guess that there is nothing
+ * cacheable in the top quarter of physical address space.
+ *
+ * It is exceedingly unlikely for machines to be populated with this
+ * much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on
+ * Sandybridge and later) due to the sheer volume of DIMMs this would
+ * actually take.
+ *
+ * However, it is possible to find machines this large, so the "top
+ * quarter" guess is supplemented to push the limit higher if references
+ * to cacheable mappings (E820/SRAT/EFI/etc) are found above the top
+ * quarter boundary.
+ *
+ * Finally, this top quarter guess gives us a good chance of being safe
+ * when running virtualised (and the CPUID maxphysaddr hasn't been
+ * levelled for heterogeneous migration safety), where the safety
+ * consideration is still in terms of host details, but all E820/etc
+ * information is in terms of guest physical layout.
+ */
+ l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits)
+ ? (1ul << paddr_bits)
+ : (3ul << (paddr_bits - 2))));
+}
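/*
 * Illustrative sketch (not part of the patch): plugging sample numbers into
 * the heuristic at the end of l1tf_calculations() above.  PAGE_MASK and the
 * max() against pre-seeded values are elided; this only evaluates the
 * ternary for two example CPUs, assuming a 64-bit unsigned long.
 */
#include <stdio.h>

static unsigned long safe_maddr(unsigned int l1d_maxphysaddr,
                                unsigned int paddr_bits)
{
    return (l1d_maxphysaddr > paddr_bits) ? (1ul << paddr_bits)
                                          : (3ul << (paddr_bits - 2));
}

int main(void)
{
    /* Nehalem-class part: L1D uses 44 address bits, CPUID reports 36. */
    printf("%#lx\n", safe_maddr(44, 36)); /* 0x1000000000 (64G): one bit
                                             above maxphysaddr is enough.  */

    /* High-end server: L1D width equals the 46-bit CPUID maxphysaddr. */
    printf("%#lx\n", safe_maddr(46, 46)); /* 0x300000000000 (48T): the
                                             "top quarter" boundary.       */
    return 0;
}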
- if ( !force && (opt_xpti != OPT_XPTI_DEFAULT) )
- return;
+int8_t __read_mostly opt_xpti = -1;
+static __init void xpti_init_default(uint64_t caps)
+{
if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
caps = ARCH_CAPABILITIES_RDCL_NO;
- else if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
- rdmsrl(MSR_ARCH_CAPABILITIES, caps);
if ( caps & ARCH_CAPABILITIES_RDCL_NO )
opt_xpti = 0;
@@ -444,7 +675,13 @@ static __init int parse_xpti(const char *s)
const char *ss;
int val, rc = 0;
- xpti_init_default(false);
+ /* Inhibit the defaults as an explicit choice has been given. */
+ if ( opt_xpti == -1 )
+ opt_xpti = 0;
+
+ /* Interpret 'xpti' alone in its positive boolean form. */
+ if ( *s == '\0' )
+ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
do {
ss = strchr(s, ',');
@@ -463,7 +700,7 @@ static __init int parse_xpti(const char *s)
default:
if ( !strcmp(s, "default") )
- xpti_init_default(true);
+ opt_xpti = -1;
else if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) |
(val ? OPT_XPTI_DOM0 : 0);
@@ -625,12 +862,58 @@ void __init init_speculation_mitigations(void)
if ( default_xen_spec_ctrl )
setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
- xpti_init_default(false);
+ if ( opt_xpti == -1 )
+ xpti_init_default(caps);
+
if ( opt_xpti == 0 )
setup_force_cpu_cap(X86_FEATURE_NO_XPTI);
else
setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
+ l1tf_calculations(caps);
+
+ /*
+ * By default, enable PV domU L1TF mitigations on all L1TF-vulnerable
+ * hardware, except when running in shim mode.
+ *
+ * In shim mode, SHADOW is expected to be compiled out, and a malicious
+ * guest kernel can only attack the shim Xen, not the host Xen.
+ */
+ if ( opt_pv_l1tf == -1 )
+ {
+ if ( pv_shim || !cpu_has_bug_l1tf )
+ opt_pv_l1tf = 0;
+ else
+ opt_pv_l1tf = OPT_PV_L1TF_DOMU;
+ }
+
+ /*
+ * By default, enable L1D_FLUSH on L1TF-vulnerable hardware, unless
+ * instructed to skip the flush on vmentry by our outer hypervisor.
+ */
+ if ( !boot_cpu_has(X86_FEATURE_L1D_FLUSH) )
+ opt_l1d_flush = 0;
+ else if ( opt_l1d_flush == -1 )
+ opt_l1d_flush = cpu_has_bug_l1tf && !(caps & ARCH_CAPS_SKIP_L1DFL);
+
+ /*
+ * We do not disable HT by default on affected hardware.
+ *
+ * Firstly, if the user intends to use exclusively PV, or HVM shadow
+ * guests, HT isn't a concern and should remain fully enabled. Secondly,
+ * safety for HVM HAP guests can be arranged by the toolstack with core
+ * parking, pinning or cpupool configurations, including mixed setups.
+ *
+ * However, if we are on affected hardware, with HT enabled, and the user
+ * hasn't explicitly chosen whether to use HT or not, nag them to do so.
+ */
+ if ( opt_smt == -1 && cpu_has_bug_l1tf && !pv_shim &&
+ boot_cpu_data.x86_num_siblings > 1 )
+ warning_add(
+ "Booted on L1TF-vulnerable hardware with SMT/Hyperthreading\n"
+ "enabled. Please assess your configuration and choose an\n"
+ "explicit 'smt=<bool>' setting. See XSA-273.\n");
+
print_details(thunk, caps);
/*
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index 166eb44fe2..2d70b45909 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -20,6 +20,7 @@
#include <xen/pfn.h>
#include <asm/e820.h>
#include <asm/page.h>
+#include <asm/spec_ctrl.h>
static struct acpi_table_slit *__read_mostly acpi_slit;
@@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
return;
+ start = ma->base_address;
+ end = start + ma->length;
+ /* Supplement the heuristics in l1tf_calculations(). */
+ l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));
+
if (num_node_memblks >= NR_NODE_MEMBLKS)
{
dprintk(XENLOG_WARNING,
@@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
return;
}
- start = ma->base_address;
- end = start + ma->length;
pxm = ma->proximity_domain;
if (srat_rev < 2)
pxm &= 0xff;
diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
index 4d372db12b..e704ed7f1c 100644
--- a/xen/arch/x86/sysctl.c
+++ b/xen/arch/x86/sysctl.c
@@ -23,6 +23,7 @@
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/processor.h>
+#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/numa.h>
#include <xen/nodemask.h>
@@ -48,14 +49,27 @@ static void l3_cache_get(void *arg)
long cpu_up_helper(void *data)
{
- int cpu = (unsigned long)data;
+ unsigned int cpu = (unsigned long)data;
int ret = cpu_up(cpu);
+
if ( ret == -EBUSY )
{
/* On EBUSY, flush RCU work and have one more go. */
rcu_barrier();
ret = cpu_up(cpu);
}
+
+ if ( !ret && !opt_smt &&
+ cpu_data[cpu].compute_unit_id == INVALID_CUID &&
+ cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) > 1 )
+ {
+ ret = cpu_down_helper(data);
+ if ( ret )
+ printk("Could not re-offline CPU%u (%d)\n", cpu, ret);
+ else
+ ret = -EPERM;
+ }
+
return ret;
}
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 9f045a2045..789d7ff8cd 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -96,8 +96,6 @@ string_param("nmi", opt_nmi);
DEFINE_PER_CPU(uint64_t, efer);
static DEFINE_PER_CPU(unsigned long, last_extable_addr);
-DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
-
DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
@@ -117,6 +115,9 @@ integer_param("debug_stack_lines", debug_stack_lines);
static bool opt_ler;
boolean_param("ler", opt_ler);
+/* LastExceptionFromIP on this hardware. Zero if LER is not in use. */
+unsigned int __read_mostly ler_msr;
+
#define stack_words_per_line 4
#define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
@@ -1778,17 +1779,6 @@ void do_device_not_available(struct cpu_user_regs *regs)
return;
}
-static void ler_enable(void)
-{
- u64 debugctl;
-
- if ( !this_cpu(ler_msr) )
- return;
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR);
-}
-
void do_debug(struct cpu_user_regs *regs)
{
unsigned long dr6;
@@ -1821,6 +1811,10 @@ void do_debug(struct cpu_user_regs *regs)
*/
write_debugreg(6, X86_DR6_DEFAULT);
+ /* #DB automatically disabled LBR. Reinstate it if debugging Xen. */
+ if ( cpu_has_xen_lbr )
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
+
if ( !guest_mode(regs) )
{
/*
@@ -1838,7 +1832,7 @@ void do_debug(struct cpu_user_regs *regs)
{
if ( regs->rip == (unsigned long)sysenter_eflags_saved )
regs->eflags &= ~X86_EFLAGS_TF;
- goto out;
+ return;
}
if ( !debugger_trap_fatal(TRAP_debug, regs) )
{
@@ -1895,20 +1889,14 @@ void do_debug(struct cpu_user_regs *regs)
regs->cs, _p(regs->rip), _p(regs->rip),
regs->ss, _p(regs->rsp), dr6);
- goto out;
+ return;
}
/* Save debug status register where guest OS can peek at it */
v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
- ler_enable();
pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
- return;
-
- out:
- ler_enable();
- return;
}
static void __init noinline __set_intr_gate(unsigned int n,
@@ -1952,38 +1940,46 @@ void load_TR(void)
: "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
}
-void percpu_traps_init(void)
+static unsigned int calc_ler_msr(void)
{
- subarch_percpu_traps_init();
-
- if ( !opt_ler )
- return;
-
switch ( boot_cpu_data.x86_vendor )
{
case X86_VENDOR_INTEL:
switch ( boot_cpu_data.x86 )
{
case 6:
- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
- break;
+ return MSR_IA32_LASTINTFROMIP;
+
case 15:
- this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
- break;
+ return MSR_P4_LER_FROM_LIP;
}
break;
+
case X86_VENDOR_AMD:
switch ( boot_cpu_data.x86 )
{
case 6:
case 0xf ... 0x17:
- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
- break;
+ return MSR_IA32_LASTINTFROMIP;
}
break;
}
- ler_enable();
+ return 0;
+}
+
+void percpu_traps_init(void)
+{
+ subarch_percpu_traps_init();
+
+ if ( !opt_ler )
+ return;
+
+ if ( !ler_msr && (ler_msr = calc_ler_msr()) )
+ setup_force_cpu_cap(X86_FEATURE_XEN_LBR);
+
+ if ( cpu_has_xen_lbr )
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
}
void __init init_idt_traps(void)
diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c
index c2aa6f2fdb..02bc75b91e 100644
--- a/xen/arch/x86/x86_64/compat/mm.c
+++ b/xen/arch/x86/x86_64/compat/mm.c
@@ -163,19 +163,6 @@ int compat_arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
return rc;
}
-int compat_update_va_mapping(unsigned int va, u32 lo, u32 hi,
- unsigned int flags)
-{
- return do_update_va_mapping(va, lo | ((u64)hi << 32), flags);
-}
-
-int compat_update_va_mapping_otherdomain(unsigned long va, u32 lo, u32 hi,
- unsigned long flags,
- domid_t domid)
-{
- return do_update_va_mapping_otherdomain(va, lo | ((u64)hi << 32), flags, domid);
-}
-
DEFINE_XEN_GUEST_HANDLE(mmuext_op_compat_t);
int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(void) arg,
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index f7f6928d70..b0401850ef 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -144,11 +144,12 @@ void show_registers(const struct cpu_user_regs *regs)
printk("CPU: %d\n", smp_processor_id());
_show_registers(&fault_regs, fault_crs, context, v);
- if ( this_cpu(ler_msr) && !guest_mode(regs) )
+ if ( ler_msr && !guest_mode(regs) )
{
u64 from, to;
- rdmsrl(this_cpu(ler_msr), from);
- rdmsrl(this_cpu(ler_msr) + 1, to);
+
+ rdmsrl(ler_msr, from);
+ rdmsrl(ler_msr + 1, to);
printk("ler: %016lx -> %016lx\n", from, to);
}
}
diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
index b4aea4b50a..15edd5df96 100644
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -670,12 +670,17 @@ static bool valid_xcr0(u64 xcr0)
return !(xcr0 & X86_XCR0_BNDREGS) == !(xcr0 & X86_XCR0_BNDCSR);
}
-int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
+int validate_xstate(const struct domain *d, uint64_t xcr0, uint64_t xcr0_accum,
+ const struct xsave_hdr *hdr)
{
+ const struct cpuid_policy *cp = d->arch.cpuid;
+ uint64_t xcr0_max =
+ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
unsigned int i;
if ( (hdr->xstate_bv & ~xcr0_accum) ||
(xcr0 & ~xcr0_accum) ||
+ (xcr0_accum & ~xcr0_max) ||
!valid_xcr0(xcr0) ||
!valid_xcr0(xcr0_accum) )
return -EINVAL;
@@ -694,20 +699,40 @@ int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
int handle_xsetbv(u32 index, u64 new_bv)
{
struct vcpu *curr = current;
+ const struct cpuid_policy *cp = curr->domain->arch.cpuid;
+ uint64_t xcr0_max =
+ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
u64 mask;
if ( index != XCR_XFEATURE_ENABLED_MASK )
return -EOPNOTSUPP;
- if ( (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
+ /*
+ * The CPUID logic shouldn't be able to hand out an XCR0 exceeding Xen's
+ * maximum features, but keep the check for robustness.
+ */
+ if ( unlikely(xcr0_max & ~xfeature_mask) )
+ {
+ gprintk(XENLOG_ERR,
+ "xcr0_max %016" PRIx64 " exceeds hardware max %016" PRIx64 "\n",
+ xcr0_max, xfeature_mask);
+ domain_crash(curr->domain);
+
+ return -EINVAL;
+ }
+
+ if ( (new_bv & ~xcr0_max) || !valid_xcr0(new_bv) )
return -EINVAL;
- /* XCR0.PKRU is disabled on PV mode. */
- if ( is_pv_vcpu(curr) && (new_bv & X86_XCR0_PKRU) )
- return -EOPNOTSUPP;
+ /* By this point, new_bv really should be accepted by hardware. */
+ if ( unlikely(!set_xcr0(new_bv)) )
+ {
+ gprintk(XENLOG_ERR, "new_bv %016" PRIx64 " rejected by hardware\n",
+ new_bv);
+ domain_crash(curr->domain);
- if ( !set_xcr0(new_bv) )
return -EFAULT;
+ }
mask = new_bv & ~curr->arch.xcr0_accum;
curr->arch.xcr0 = new_bv;
diff --git a/xen/common/cpu.c b/xen/common/cpu.c
index 6350f150bd..653a56b840 100644
--- a/xen/common/cpu.c
+++ b/xen/common/cpu.c
@@ -67,12 +67,17 @@ void __init register_cpu_notifier(struct notifier_block *nb)
spin_unlock(&cpu_add_remove_lock);
}
-static int take_cpu_down(void *unused)
+static void _take_cpu_down(void *unused)
{
void *hcpu = (void *)(long)smp_processor_id();
int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL);
BUG_ON(notifier_rc != NOTIFY_DONE);
__cpu_disable();
+}
+
+static int take_cpu_down(void *arg)
+{
+ _take_cpu_down(arg);
return 0;
}
@@ -98,7 +103,9 @@ int cpu_down(unsigned int cpu)
goto fail;
}
- if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
+ if ( unlikely(system_state < SYS_STATE_active) )
+ on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true);
+ else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
goto fail;
__cpu_die(cpu);
diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
index 999839444e..1e8edcbd57 100644
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -490,7 +490,7 @@ static int cpupool_cpu_add(unsigned int cpu)
cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
cpumask_set_cpu(cpu, &cpupool_free_cpus);
- if ( system_state == SYS_STATE_resume )
+ if ( system_state == SYS_STATE_suspend || system_state == SYS_STATE_resume )
{
struct cpupool **c;
@@ -522,6 +522,7 @@ static int cpupool_cpu_add(unsigned int cpu)
* (or unplugging would have failed) and that is the default behavior
* anyway.
*/
+ per_cpu(cpupool, cpu) = NULL;
ret = cpupool_assign_cpu_locked(cpupool0, cpu);
}
out:
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index 64d12685d3..6be0b3986f 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -1304,6 +1304,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
#ifndef CONFIG_ARM /* TODO - runtime service support */
+#include <asm/spec_ctrl.h>
+
static bool __initdata efi_map_uc;
static int __init parse_efi_param(const char *s)
@@ -1419,6 +1421,16 @@ void __init efi_init_memory(void)
desc->PhysicalStart, desc->PhysicalStart + len - 1,
desc->Type, desc->Attribute);
+ if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) ||
+ (efi_bs_revision >= EFI_REVISION(2, 5) &&
+ (desc->Attribute & EFI_MEMORY_WP)) )
+ {
+ /* Supplement the heuristics in l1tf_calculations(). */
+ l1tf_safe_maddr =
+ max(l1tf_safe_maddr,
+ ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE));
+ }
+
if ( !efi_enabled(EFI_RS) ||
(!(desc->Attribute & EFI_MEMORY_RUNTIME) &&
(!map_bs ||
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index c757b7f6f5..231ecf509a 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -97,7 +97,11 @@ static unsigned int __read_mostly max_maptrack_frames =
DEFAULT_MAX_MAPTRACK_FRAMES;
integer_runtime_param("gnttab_max_maptrack_frames", max_maptrack_frames);
-static unsigned int __read_mostly opt_gnttab_max_version = 2;
+#ifndef GNTTAB_MAX_VERSION
+#define GNTTAB_MAX_VERSION 2
+#endif
+
+static unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION;
static bool __read_mostly opt_transitive_grants = true;
static int __init parse_gnttab(const char *s)
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 20ee1e4897..02aeed7c47 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1426,7 +1426,7 @@ static void free_heap_pages(
page_list_del(predecessor, &heap(node, zone, order));
- /* Keep predecessor's first_dirty if it is already set. */
+ /* Update predecessor's first_dirty if necessary. */
if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
pg->u.free.first_dirty != INVALID_DIRTY_IDX )
predecessor->u.free.first_dirty = (1U << order) +
@@ -1447,6 +1447,12 @@ static void free_heap_pages(
check_and_stop_scrub(successor);
+ /* Update pg's first_dirty if necessary. */
+ if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX &&
+ successor->u.free.first_dirty != INVALID_DIRTY_IDX )
+ pg->u.free.first_dirty = (1U << order) +
+ successor->u.free.first_dirty;
+
page_list_del(successor, &heap(node, zone, order));
}
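/*
 * Illustrative sketch (not part of the patch): when a free block is merged
 * with its successor buddy, the successor's pages land at offset
 * (1U << order) inside the merged block, so its first_dirty index is rebased
 * by that amount, mirroring the existing predecessor case above.
 */
#include <assert.h>

int main(void)
{
    unsigned int order = 3;            /* merging two order-3 buddies      */
    unsigned int succ_first_dirty = 2; /* successor's first dirty page     */

    /* Position of that page within the combined order-4 block. */
    assert((1u << order) + succ_first_dirty == 10);
    return 0;
}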
diff --git a/xen/common/tasklet.c b/xen/common/tasklet.c
index 0f0a6f8365..d4fea3151c 100644
--- a/xen/common/tasklet.c
+++ b/xen/common/tasklet.c
@@ -156,6 +156,10 @@ void tasklet_kill(struct tasklet *t)
spin_lock_irqsave(&tasklet_lock, flags);
+ /* Cope with uninitialised tasklets. */
+ if ( list_head_is_null(&t->list) )
+ goto unlock;
+
if ( !list_empty(&t->list) )
{
BUG_ON(t->is_dead || t->is_running || (t->scheduled_on < 0));
@@ -172,6 +176,7 @@ void tasklet_kill(struct tasklet *t)
spin_lock_irqsave(&tasklet_lock, flags);
}
+ unlock:
spin_unlock_irqrestore(&tasklet_lock, flags);
}
diff --git a/xen/include/asm-arm/arm32/system.h b/xen/include/asm-arm/arm32/system.h
index c617b40438..ab57abfbc5 100644
--- a/xen/include/asm-arm/arm32/system.h
+++ b/xen/include/asm-arm/arm32/system.h
@@ -48,6 +48,24 @@ static inline int local_fiq_is_enabled(void)
return !(flags & PSR_FIQ_MASK);
}
+#define CSDB ".inst 0xe320f014"
+
+static inline unsigned long array_index_mask_nospec(unsigned long idx,
+ unsigned long sz)
+{
+ unsigned long mask;
+
+ asm volatile( "cmp %1, %2\n"
+ "sbc %0, %1, %1\n"
+ CSDB
+ : "=r" (mask)
+ : "r" (idx), "Ir" (sz)
+ : "cc" );
+
+ return mask;
+}
+#define array_index_mask_nospec array_index_mask_nospec
+
#endif
/*
* Local variables:
diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h
index 2e2ee212a1..2e36573ac6 100644
--- a/xen/include/asm-arm/arm64/system.h
+++ b/xen/include/asm-arm/arm64/system.h
@@ -58,6 +58,28 @@ static inline int local_fiq_is_enabled(void)
return !(flags & PSR_FIQ_MASK);
}
+#define csdb() asm volatile ( "hint #20" : : : "memory" )
+
+/*
+ * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz
+ * and 0 otherwise.
+ */
+static inline unsigned long array_index_mask_nospec(unsigned long idx,
+ unsigned long sz)
+{
+ unsigned long mask;
+
+ asm volatile ( "cmp %1, %2\n"
+ "sbc %0, xzr, xzr\n"
+ : "=r" (mask)
+ : "r" (idx), "Ir" (sz)
+ : "cc" );
+ csdb();
+
+ return mask;
+}
+#define array_index_mask_nospec array_index_mask_nospec
+
#endif
/*
* Local variables:
diff --git a/xen/include/asm-arm/grant_table.h b/xen/include/asm-arm/grant_table.h
index e52936c79f..24958e4670 100644
--- a/xen/include/asm-arm/grant_table.h
+++ b/xen/include/asm-arm/grant_table.h
@@ -7,6 +7,7 @@
#include <xen/sched.h>
#define INITIAL_NR_GRANT_FRAMES 1U
+#define GNTTAB_MAX_VERSION 1
struct grant_table_arch {
gfn_t *shared_gfn;
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index 2cf8f7ea2a..b237da165c 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -113,6 +113,7 @@
#define cpu_has_aperfmperf boot_cpu_has(X86_FEATURE_APERFMPERF)
#define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH)
#define cpu_has_no_xpti boot_cpu_has(X86_FEATURE_NO_XPTI)
+#define cpu_has_xen_lbr boot_cpu_has(X86_FEATURE_XEN_LBR)
enum _cache_type {
CACHE_TYPE_NULL = 0,
diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
index b90aa2d046..8e5cc53dde 100644
--- a/xen/include/asm-x86/cpufeatures.h
+++ b/xen/include/asm-x86/cpufeatures.h
@@ -32,3 +32,4 @@ XEN_CPUFEATURE(SC_RSB_PV, (FSCAPINTS+0)*32+18) /* RSB overwrite needed for
XEN_CPUFEATURE(SC_RSB_HVM, (FSCAPINTS+0)*32+19) /* RSB overwrite needed for HVM */
XEN_CPUFEATURE(NO_XPTI, (FSCAPINTS+0)*32+20) /* XPTI mitigation not in use */
XEN_CPUFEATURE(SC_MSR_IDLE, (FSCAPINTS+0)*32+21) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
+XEN_CPUFEATURE(XEN_LBR, (FSCAPINTS+0)*32+22) /* Xen uses MSR_DEBUGCTL.LBR */
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index e0d413c7de..61e6900465 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -121,6 +121,11 @@ struct shadow_domain {
/* Has this domain ever used HVMOP_pagetable_dying? */
bool_t pagetable_dying_op;
+
+#ifdef CONFIG_PV
+ /* PV L1 Terminal Fault mitigation. */
+ struct tasklet pv_l1tf_tasklet;
+#endif /* CONFIG_PV */
#endif
};
@@ -257,6 +262,8 @@ struct pv_domain
bool xpti;
/* Use PCID feature? */
bool pcid;
+ /* Mitigate L1TF with shadow/crashing? */
+ bool check_l1tf;
/* map_domain_page() mapping cache. */
struct mapcache_domain mapcache;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 06c3179cec..57e5098b99 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -130,10 +130,18 @@ struct arch_vmx_struct {
uint64_t sfmask;
struct vmx_msr_bitmap *msr_bitmap;
- unsigned int msr_count;
+
+ /*
+ * Most accesses to the MSR host/guest load/save lists are in current
+ * context. However, the data can be modified by toolstack/migration
+ * actions. Remote access is only permitted for paused vcpus, and is
+ * protected under the domctl lock.
+ */
struct vmx_msr_entry *msr_area;
- unsigned int host_msr_count;
struct vmx_msr_entry *host_msr_area;
+ unsigned int msr_load_count;
+ unsigned int msr_save_count;
+ unsigned int host_msr_count;
unsigned long eoi_exitmap_changed;
DECLARE_BITMAP(eoi_exit_bitmap, NR_VECTORS);
@@ -149,7 +157,7 @@ struct arch_vmx_struct {
/* Are we emulating rather than VMENTERing? */
uint8_t vmx_emulate;
- uint8_t lbr_fixup_enabled;
+ uint8_t lbr_flags;
/* Bitmask of segments that we can't safely use in virtual 8086 mode */
uint16_t vm86_segment_mask;
@@ -514,9 +522,6 @@ enum vmcs_field {
#define VMCS_VPID_WIDTH 16
-#define VMX_GUEST_MSR 0
-#define VMX_HOST_MSR 1
-
/* VM Instruction error numbers */
enum vmx_insn_errno
{
@@ -534,6 +539,67 @@ enum vmx_insn_errno
VMX_INSN_FAIL_INVALID = ~0,
};
+/* MSR load/save list infrastructure. */
+enum vmx_msr_list_type {
+ VMX_MSR_HOST, /* MSRs loaded on VMExit. */
+ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */
+ VMX_MSR_GUEST_LOADONLY, /* MSRs loaded on VMEntry only. */
+};
+
+/**
+ * Add an MSR to an MSR list (inserting space for the entry if necessary), and
+ * set the MSR's value.
+ *
+ * It is undefined behaviour to try and insert the same MSR into both the
+ * GUEST and GUEST_LOADONLY list.
+ *
+ * May fail if unable to allocate memory for the list, or the total number of
+ * entries exceeds the memory allocated.
+ */
+int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+ enum vmx_msr_list_type type);
+
+static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr, uint64_t val)
+{
+ return vmx_add_msr(v, msr, val, VMX_MSR_GUEST);
+}
+static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr,
+ uint64_t val)
+{
+ return vmx_add_msr(v, msr, val, VMX_MSR_HOST);
+}
+
+struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+ enum vmx_msr_list_type type);
+
+static inline int vmx_read_guest_msr(const struct vcpu *v, uint32_t msr,
+ uint64_t *val)
+{
+ const struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
+
+ if ( !ent )
+ return -ESRCH;
+
+ *val = ent->data;
+
+ return 0;
+}
+
+static inline int vmx_write_guest_msr(struct vcpu *v, uint32_t msr,
+ uint64_t val)
+{
+ struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
+
+ if ( !ent )
+ return -ESRCH;
+
+ ent->data = val;
+
+ return 0;
+}
+
+
+/* MSR intercept bitmap infrastructure. */
enum vmx_msr_intercept_type {
VMX_MSR_R = 1,
VMX_MSR_W = 2,
@@ -544,10 +610,6 @@ void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
enum vmx_msr_intercept_type type);
void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr,
enum vmx_msr_intercept_type type);
-int vmx_read_guest_msr(u32 msr, u64 *val);
-int vmx_write_guest_msr(u32 msr, u64 val);
-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type);
-int vmx_add_msr(u32 msr, int type);
void vmx_vmcs_switch(paddr_t from, paddr_t to);
void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector);
void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector);
@@ -562,15 +624,6 @@ void virtual_vmcs_vmwrite(const struct vcpu *, u32 encoding, u64 val);
enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v,
u32 vmcs_encoding, u64 val);
-static inline int vmx_add_guest_msr(u32 msr)
-{
- return vmx_add_msr(msr, VMX_GUEST_MSR);
-}
-static inline int vmx_add_host_load_msr(u32 msr)
-{
- return vmx_add_msr(msr, VMX_HOST_MSR);
-}
-
DECLARE_PER_CPU(bool_t, vmxon);
bool_t vmx_vcpu_pml_enabled(const struct vcpu *v);
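/*
 * Illustrative sketch (not part of the patch): a hypothetical caller of the
 * reworked MSR load/save list API declared above.  Unlike the old
 * vmx_add_msr(msr, type), every call now names the vcpu explicitly and
 * supplies an initial value, which is what permits remote (toolstack) access
 * to a paused vcpu under the domctl lock.  example_track_msr() is invented
 * here purely for illustration.
 */
static int example_track_msr(struct vcpu *v, uint32_t msr, uint64_t init)
{
    uint64_t cur;
    int rc = vmx_add_guest_msr(v, msr, init); /* saved/loaded across VMExit,
                                                 nonzero rc on failure      */

    if ( rc )
        return rc;

    return vmx_read_guest_msr(v, msr, &cur);  /* 0 on success, -ESRCH if the
                                                 MSR isn't on the list      */
}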
diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h
index 1cc2e37d5c..da38b7991c 100644
--- a/xen/include/asm-x86/hypercall.h
+++ b/xen/include/asm-x86/hypercall.h
@@ -165,7 +165,7 @@ extern int compat_update_va_mapping(
unsigned int va, u32 lo, u32 hi, unsigned int flags);
extern int compat_update_va_mapping_otherdomain(
- unsigned long va, u32 lo, u32 hi, unsigned long flags, domid_t domid);
+ unsigned int va, u32 lo, u32 hi, unsigned int flags, domid_t domid);
DEFINE_XEN_GUEST_HANDLE(trap_info_compat_t);
extern int compat_set_trap_table(XEN_GUEST_HANDLE(trap_info_compat_t) traps);
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 8fbccc88a7..7235623c86 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -47,8 +47,12 @@
#define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0)
#define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1)
#define ARCH_CAPS_RSBA (_AC(1, ULL) << 2)
+#define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3)
#define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
+#define MSR_FLUSH_CMD 0x0000010b
+#define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
+
/* Intel MSRs. Some also available on other CPUs */
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_A_PERFCTR0 0x000004c1
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index f14f265aa5..afbeb7f155 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -241,7 +241,7 @@ static inline void write_efer(uint64_t val)
wrmsrl(MSR_EFER, val);
}
-DECLARE_PER_CPU(u32, ler_msr);
+extern unsigned int ler_msr;
DECLARE_PER_CPU(uint32_t, tsc_aux);
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index f0085511c7..f440e3e53c 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -37,11 +37,14 @@
#define PG_SH_shift 20
#define PG_HAP_shift 21
+#define PG_SHF_shift 22
/* We're in one of the shadow modes */
#ifdef CONFIG_SHADOW_PAGING
#define PG_SH_enable (1U << PG_SH_shift)
+#define PG_SH_forced (1U << PG_SHF_shift)
#else
#define PG_SH_enable 0
+#define PG_SH_forced 0
#endif
#define PG_HAP_enable (1U << PG_HAP_shift)
@@ -62,6 +65,7 @@
#define paging_mode_enabled(_d) (!!(_d)->arch.paging.mode)
#define paging_mode_shadow(_d) (!!((_d)->arch.paging.mode & PG_SH_enable))
+#define paging_mode_sh_forced(_d) (!!((_d)->arch.paging.mode & PG_SH_forced))
#define paging_mode_hap(_d) (!!((_d)->arch.paging.mode & PG_HAP_enable))
#define paging_mode_refcounts(_d) (!!((_d)->arch.paging.mode & PG_refcounts))
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 9924cdf1f3..2bd9e69684 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -337,12 +337,6 @@ static always_inline void set_in_cr4 (unsigned long mask)
write_cr4(read_cr4() | mask);
}
-static always_inline void clear_in_cr4 (unsigned long mask)
-{
- mmu_cr4_features &= ~mask;
- write_cr4(read_cr4() & ~mask);
-}
-
static inline unsigned int read_pkru(void)
{
unsigned int pkru;
diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
index 19232afa01..c09a5ff381 100644
--- a/xen/include/asm-x86/setup.h
+++ b/xen/include/asm-x86/setup.h
@@ -66,6 +66,8 @@ extern uint8_t kbd_shift_flags;
extern unsigned long highmem_start;
#endif
+extern int8_t opt_smt;
+
#ifdef CONFIG_SHADOW_PAGING
extern bool opt_dom0_shadow;
#else
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
index 94a34fd16a..f40f411871 100644
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -29,6 +29,7 @@
#include <asm/flushtlb.h>
#include <asm/paging.h>
#include <asm/p2m.h>
+#include <asm/spec_ctrl.h>
/*****************************************************************************
* Macros to tell which shadow paging mode a domain is in*/
@@ -115,6 +116,131 @@ static inline int shadow_domctl(struct domain *d,
#endif /* CONFIG_SHADOW_PAGING */
+/*
+ * Mitigations for L1TF / CVE-2018-3620 for PV guests.
+ *
+ * We cannot alter an architecturally-legitimate PTE which a PV guest has
+ * chosen to write, as traditional paged-out metadata is L1TF-vulnerable.
+ * What we can do is force a PV guest which writes a vulnerable PTE into
+ * shadow mode, so Xen controls the pagetables which are reachable by the CPU
+ * pagewalk.
+ *
+ * The core of the L1TF vulnerability is that the address bits of the PTE
+ * (accounting for PSE and factoring in the level-relevant part of the linear
+ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or
+ * eventual memory address) before the Present or reserved bits (which would
+ * cause a terminal fault) are accounted for. If an L1D hit occurs, the
+ * resulting data is available for potentially dependent instructions.
+ *
+ * For Present PTEs, the PV type-count safety logic ensures that the address
+ * bits always point at a guest-accessible frame, which is safe WRT L1TF from
+ * Xen's point of view. In practice, a PV guest should be unable to set any
+ * reserved bits, so should be unable to create any present L1TF-vulnerable
+ * PTEs at all.
+ *
+ * Therefore, these safety checks apply to Not-Present PTEs only, where
+ * traditionally, Xen would have let the guest write any value it chose.
+ *
+ * The all-zero PTE potentially leaks mfn 0. All software on the system is
+ * expected to cooperate and not put any secrets there. In a Xen system,
+ * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains
+ * the real mode IVT and BIOS Data Area. Therefore, mfn 0 is considered safe.
+ *
+ * Any PTE whose address is higher than the maximum cacheable address is safe,
+ * as it won't get an L1D hit.
+ *
+ * Speculative superpages also need accounting for, as PSE is considered
+ * irrespective of Present. We disallow PSE being set, as it allows an
+ * attacker to leak 2M or 1G of data starting from mfn 0. Also, because of
+ * recursive/linear pagetables, we must consider PSE even at L4, as hardware
+ * will interpret an L4e as an L3e during a recursive walk.
+ */
+
+static inline bool is_l1tf_safe_maddr(intpte_t pte)
+{
+ paddr_t maddr = pte & l1tf_addr_mask;
+
+ return maddr == 0 || maddr >= l1tf_safe_maddr;
+}
+
+static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level,
+ intpte_t pte)
+{
+ ASSERT(is_pv_domain(d));
+ ASSERT(!(pte & _PAGE_PRESENT));
+
+ if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) &&
+ (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) )
+ {
+#ifdef CONFIG_SHADOW_PAGING
+ struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet;
+
+ printk(XENLOG_G_WARNING
+ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n",
+ d->domain_id, level, pte);
+ /*
+ * Safety consideration for accessing tasklet.scheduled_on without the
+ * tasklet lock. This is a singleshot tasklet with the side effect of
+ * setting PG_SH_forced (checked just above). Multiple vcpus can race
+ * to schedule the tasklet, but if we observe it scheduled anywhere,
+ * that is good enough.
+ */
+ smp_rmb();
+ if ( !tasklet_is_scheduled(t) )
+ tasklet_schedule(t);
+#else
+ printk(XENLOG_G_ERR
+ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n",
+ d->domain_id, level, pte);
+ domain_crash(d);
+#endif
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e)
+{
+ return pv_l1tf_check_pte(d, 1, l1e.l1);
+}
+
+static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e)
+{
+ return pv_l1tf_check_pte(d, 2, l2e.l2);
+}
+
+static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e)
+{
+ return pv_l1tf_check_pte(d, 3, l3e.l3);
+}
+
+static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e)
+{
+ return pv_l1tf_check_pte(d, 4, l4e.l4);
+}
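+
+/*
+ * Usage sketch (hypothetical caller; the real call sites live in the PV
+ * pagetable-update paths): run the per-level check when a guest writes a
+ * not-present PTE, and restart the operation if shadowing was engaged.
+ */
+#if 0 /* illustrative only */
+static int example_write_l1e(struct domain *d, l1_pgentry_t nl1e)
+{
+    if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) &&
+         pv_l1tf_check_l1e(d, nl1e) )
+        return -ERESTART; /* assumed error handling; callers may differ */
+
+    /* ... continue with the normal PTE update ... */
+    return 0;
+}
+#endif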
+
+void pv_l1tf_tasklet(unsigned long data);
+
+static inline void pv_l1tf_domain_init(struct domain *d)
+{
+ d->arch.pv_domain.check_l1tf =
+ opt_pv_l1tf & (is_hardware_domain(d)
+ ? OPT_PV_L1TF_DOM0 : OPT_PV_L1TF_DOMU);
+
+#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
+ tasklet_init(&d->arch.paging.shadow.pv_l1tf_tasklet,
+ pv_l1tf_tasklet, (unsigned long)d);
+#endif
+}
+
+static inline void pv_l1tf_domain_destroy(struct domain *d)
+{
+#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
+ tasklet_kill(&d->arch.paging.shadow.pv_l1tf_tasklet);
+#endif
+}
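+
+/*
+ * Lifecycle sketch (hypothetical wrappers; the real call sites are in the
+ * domain creation/destruction paths): set up the policy and tasklet when a
+ * PV domain is built, and kill the tasklet on teardown.
+ */
+#if 0 /* illustrative only */
+static int example_pv_domain_initialise(struct domain *d)
+{
+    pv_l1tf_domain_init(d);    /* selects the dom0 vs domU policy */
+    return 0;
+}
+
+static void example_pv_domain_deinitialise(struct domain *d)
+{
+    pv_l1tf_domain_destroy(d); /* ensure the tasklet cannot run later */
+}
+#endif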
+
/* Remove all shadows of the guest mfn. */
static inline void shadow_remove_all_shadows(struct domain *d, mfn_t gmfn)
{
diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
index 4e5f673fec..09c55458df 100644
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -26,6 +26,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
+extern bool park_offline_cpus;
+
void smp_send_nmi_allbutself(void);
void send_IPI_mask(const cpumask_t *, int vector);
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
index 5b40afbab0..8f8aad40bb 100644
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -29,15 +29,27 @@ void init_speculation_mitigations(void);
extern bool opt_ibpb;
extern bool opt_ssbd;
extern int8_t opt_eager_fpu;
+extern int8_t opt_l1d_flush;
extern bool bsp_delay_spec_ctrl;
extern uint8_t default_xen_spec_ctrl;
extern uint8_t default_spec_ctrl_flags;
-extern uint8_t opt_xpti;
+extern int8_t opt_xpti;
#define OPT_XPTI_DOM0 0x01
#define OPT_XPTI_DOMU 0x02
+extern int8_t opt_pv_l1tf;
+#define OPT_PV_L1TF_DOM0 0x01
+#define OPT_PV_L1TF_DOMU 0x02
+
+/*
+ * The L1D address mask, which might be wider than reported in CPUID, and the
+ * system physical address above which there are believed to be no cacheable
+ * memory regions, and hence no way to leak data via L1TF.
+ */
+extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;
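+
+/*
+ * Sketch of one plausible derivation of these values (an assumption for
+ * illustration; the real calculation also covers CPU models whose L1D is
+ * wider than the CPUID-reported physical address width):
+ */
+#if 0 /* illustrative only */
+static void __init example_l1tf_calculations(unsigned int addr_bits)
+{
+    /* Mask covering the address bits a PTE can express below addr_bits. */
+    l1tf_addr_mask = ((1UL << addr_bits) - 1) & PAGE_MASK;
+
+    /* Assume no cacheable memory exists at or above the reported width. */
+    l1tf_safe_maddr = (paddr_t)1 << addr_bits;
+}
+#endif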
+
static inline void init_shadow_spec_ctrl_state(void)
{
struct cpu_info *info = get_cpu_info();
diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
index 43fb6fe489..483cd20afd 100644
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -221,6 +221,30 @@ static always_inline unsigned long __xadd(
#define set_mb(var, value) do { xchg(&var, value); } while (0)
#define set_wmb(var, value) do { var = value; smp_wmb(); } while (0)
+/**
+ * array_index_mask_nospec() - generate a mask that is ~0UL when the
+ * bounds check succeeds and 0 otherwise
+ * @index: array element index
+ * @size: number of elements in array
+ *
+ * Returns:
+ * 0 - (index < size)
+ */
+static inline unsigned long array_index_mask_nospec(unsigned long index,
+ unsigned long size)
+{
+ unsigned long mask;
+
+ asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];"
+ : [mask] "=r" (mask)
+ : [size] "g" (size), [index] "r" (index) );
+
+ return mask;
+}
+
+/* Override default implementation in nospec.h. */
+#define array_index_mask_nospec array_index_mask_nospec
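+
+/*
+ * Worked example of the cmp/sbb sequence above:
+ *
+ *   index = 3, size = 8: "cmp" borrows (3 - 8 underflows), setting CF = 1;
+ *       "sbb %reg,%reg" then computes reg - reg - CF = -CF, i.e. all ones.
+ *   index = 9, size = 8: no borrow, CF = 0, and the same sbb leaves 0.
+ *
+ * ANDing an index with the mask therefore preserves in-bounds values and
+ * forces out-of-bounds ones to 0, with no branch for the CPU to predict.
+ */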
+
#define local_irq_disable() asm volatile ( "cli" : : : "memory" )
#define local_irq_enable() asm volatile ( "sti" : : : "memory" )
diff --git a/xen/include/asm-x86/xstate.h b/xen/include/asm-x86/xstate.h
index 86a4a1f75c..47f602b855 100644
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -97,8 +97,9 @@ void xsave(struct vcpu *v, uint64_t mask);
void xrstor(struct vcpu *v, uint64_t mask);
void xstate_set_init(uint64_t mask);
bool xsave_enabled(const struct vcpu *v);
-int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum,
- const struct xsave_hdr *);
+int __must_check validate_xstate(const struct domain *d,
+ uint64_t xcr0, uint64_t xcr0_accum,
+ const struct xsave_hdr *hdr);
int __must_check handle_xsetbv(u32 index, u64 new_bv);
void expand_xsave_states(struct vcpu *v, void *dest, unsigned int size);
void compress_xsave_states(struct vcpu *v, const void *src, unsigned int size);
diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
index f1a5ed93e0..6c82816fd3 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -244,6 +244,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions *
XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */
XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */
XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */
+XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /*S MSR_FLUSH_CMD and L1D flush. */
XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */
XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */
diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h
index 533a8ea0f3..a7e05681c9 100644
--- a/xen/include/xen/compiler.h
+++ b/xen/include/xen/compiler.h
@@ -81,6 +81,9 @@
#pragma GCC visibility push(hidden)
#endif
+/* Make the optimizer believe the variable can be manipulated arbitrarily. */
+#define OPTIMIZER_HIDE_VAR(var) __asm__ ( "" : "+g" (var) )
+
/* This macro obfuscates arithmetic on a variable address so that gcc
shouldn't recognize the original var, and make assumptions about it */
/*
diff --git a/xen/include/xen/cpu.h b/xen/include/xen/cpu.h
index ffefc09f8e..2fe3ec05d8 100644
--- a/xen/include/xen/cpu.h
+++ b/xen/include/xen/cpu.h
@@ -47,6 +47,8 @@ void register_cpu_notifier(struct notifier_block *nb);
#define CPU_DYING (0x0007 | NOTIFY_REVERSE)
/* CPU_DEAD: CPU is dead. */
#define CPU_DEAD (0x0008 | NOTIFY_REVERSE)
+/* CPU_REMOVE: CPU was removed. */
+#define CPU_REMOVE (0x0009 | NOTIFY_REVERSE)
/* Perform CPU hotplug. May return -EAGAIN. */
int cpu_down(unsigned int cpu);
diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h
index 42340a098e..4a11bcc3f3 100644
--- a/xen/include/xen/cpumask.h
+++ b/xen/include/xen/cpumask.h
@@ -351,16 +351,35 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
return *mask != NULL;
}
+static inline bool cond_alloc_cpumask_var(cpumask_var_t *mask)
+{
+ if (*mask == NULL)
+ *mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long));
+ return *mask != NULL;
+}
+
static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
{
*(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
return *mask != NULL;
}
+static inline bool cond_zalloc_cpumask_var(cpumask_var_t *mask)
+{
+ if (*mask == NULL)
+ *mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
+ else
+ cpumask_clear(*mask);
+ return *mask != NULL;
+}
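+
+/*
+ * Usage sketch (hypothetical caller): unlike the plain alloc/zalloc
+ * variants, the cond_* forms may be called again for a CPU whose masks
+ * survived a previous offline, reusing the existing allocation rather
+ * than leaking it.
+ */
+#if 0 /* illustrative only */
+static int example_cpu_up_prepare(unsigned int cpu)
+{
+    if ( !cond_zalloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) )
+        return -ENOMEM;
+    return 0;
+}
+#endif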
+
static inline void free_cpumask_var(cpumask_var_t mask)
{
xfree(mask);
}
+
+/* Free an allocated mask, and zero the pointer to it. */
+#define FREE_CPUMASK_VAR(m) XFREE(m)
#else
typedef cpumask_t cpumask_var_t[1];
@@ -368,16 +387,20 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
{
return 1;
}
+#define cond_alloc_cpumask_var alloc_cpumask_var
static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
{
cpumask_clear(*mask);
return 1;
}
+#define cond_zalloc_cpumask_var zalloc_cpumask_var
static inline void free_cpumask_var(cpumask_var_t mask)
{
}
+
+#define FREE_CPUMASK_VAR(m) free_cpumask_var(m)
#endif
#if NR_CPUS > 1
diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h
index fa07d720ee..1387abb211 100644
--- a/xen/include/xen/list.h
+++ b/xen/include/xen/list.h
@@ -51,6 +51,11 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
list->prev = list;
}
+static inline bool list_head_is_null(const struct list_head *list)
+{
+ return !list->next && !list->prev;
+}
+
/*
* Insert a new entry between two known consecutive entries.
*
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index e928551c91..24654e8e22 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -162,6 +162,14 @@ void free_xenheap_pages(void *v, unsigned int order);
bool scrub_free_pages(void);
#define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
#define free_xenheap_page(v) (free_xenheap_pages(v,0))
+
+/* Free an allocation, and zero the pointer to it. */
+#define FREE_XENHEAP_PAGES(p, o) do { \
+ free_xenheap_pages(p, o); \
+ (p) = NULL; \
+} while ( false )
+#define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0)
+
/* Map machine page range in Xen virtual address space. */
int map_pages_to_xen(
unsigned long virt,
diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h
new file mode 100644
index 0000000000..48793996e8
--- /dev/null
+++ b/xen/include/xen/nospec.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2018 Linus Torvalds. All rights reserved. */
+/* Copyright(c) 2018 Alexei Starovoitov. All rights reserved. */
+/* Copyright(c) 2018 Intel Corporation. All rights reserved. */
+/* Copyright(c) 2018 Citrix Systems R&D Ltd. All rights reserved. */
+
+#ifndef XEN_NOSPEC_H
+#define XEN_NOSPEC_H
+
+#include <asm/system.h>
+
+/**
+ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
+ * @index: array element index
+ * @size: number of elements in array
+ *
+ * When @index is out of bounds (@index >= @size), the sign bit will be
+ * set. Extend the sign bit to all bits and invert, giving a result of
+ * zero for an out of bounds index, or ~0 if within bounds [0, @size).
+ */
+#ifndef array_index_mask_nospec
+static inline unsigned long array_index_mask_nospec(unsigned long index,
+ unsigned long size)
+{
+ /*
+ * Always calculate and emit the mask even if the compiler
+ * thinks the mask is not needed. The compiler does not take
+ * into account the value of @index under speculation.
+ */
+ OPTIMIZER_HIDE_VAR(index);
+ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
+}
+#endif
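+
+/*
+ * Worked example of the arithmetic above, on a 64-bit build:
+ *
+ *   index = 3, size = 8: size - 1 - index = 4, and (3 | 4) = 7 has a clear
+ *       sign bit, so its complement is negative and the arithmetic shift
+ *       by 63 yields a mask of all ones (the index is preserved).
+ *   index = 9, size = 8: size - 1 - index underflows to a value with the
+ *       sign bit set, so its complement is non-negative and the shift
+ *       yields 0 (the index is clamped to 0).
+ */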
+
+/*
+ * array_index_nospec - sanitize an array index after a bounds check
+ *
+ * For a code sequence like:
+ *
+ * if (index < size) {
+ * index = array_index_nospec(index, size);
+ * val = array[index];
+ * }
+ *
+ * ...if the CPU speculates past the bounds check then
+ * array_index_nospec() will clamp the index within the range of [0,
+ * size).
+ */
+#define array_index_nospec(index, size) \
+({ \
+ typeof(index) _i = (index); \
+ typeof(size) _s = (size); \
+ unsigned long _mask = array_index_mask_nospec(_i, _s); \
+ \
+ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
+ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
+ \
+ (typeof(_i)) (_i & _mask); \
+})
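+
+/*
+ * Minimal usage sketch (hypothetical table and handler names): clamp an
+ * index that has already passed a bounds check before it is used to load
+ * from memory, so a mispredicted check cannot steer the load.
+ */
+#if 0 /* illustrative only */
+static long example_lookup(unsigned int idx)
+{
+    static const long table[8] = { 0 };
+
+    if ( idx >= ARRAY_SIZE(table) )
+        return -EINVAL;
+
+    return table[array_index_nospec(idx, ARRAY_SIZE(table))];
+}
+#endif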
+
+#endif /* XEN_NOSPEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 99d2af2e1f..e79d5a36ca 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -788,7 +788,7 @@ static inline struct domain *next_domain_in_cpupool(
#define _VPF_parked 8
#define VPF_parked (1UL<<_VPF_parked)
-static inline int vcpu_runnable(struct vcpu *v)
+static inline bool vcpu_runnable(const struct vcpu *v)
{
return !(v->pause_flags |
atomic_read(&v->pause_count) |
diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h
index 23d69c738e..bc9ddace6d 100644
--- a/xen/include/xen/tasklet.h
+++ b/xen/include/xen/tasklet.h
@@ -50,6 +50,11 @@ static inline bool tasklet_work_to_do(unsigned int cpu)
TASKLET_scheduled);
}
+static inline bool tasklet_is_scheduled(const struct tasklet *t)
+{
+ return t->scheduled_on != -1;
+}
+
void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu);
void tasklet_schedule(struct tasklet *t);
void do_tasklet(void);
diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h
index cc2673d8ae..9aa5edf593 100644
--- a/xen/include/xen/xmalloc.h
+++ b/xen/include/xen/xmalloc.h
@@ -26,6 +26,12 @@
/* Free any of the above. */
extern void xfree(void *);
+/* Free an allocation, and zero the pointer to it. */
+#define XFREE(p) do { \
+ xfree(p); \
+ (p) = NULL; \
+} while ( false )
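+
+/*
+ * Usage sketch: XFREE() replaces the common "xfree(p); p = NULL;" pattern,
+ * e.g. XFREE(v->arch.example_state) on a hypothetical field, so the pointer
+ * cannot be freed twice and any later use faults on NULL rather than
+ * touching freed memory.
+ */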
+
/* Underlying functions */
extern void *_xmalloc(unsigned long size, unsigned long align);
extern void *_xzalloc(unsigned long size, unsigned long align);