summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThorsten Leemhuis <fedora@leemhuis.info>2020-04-08 06:50:36 +0200
committerThorsten Leemhuis <fedora@leemhuis.info>2020-04-08 06:50:36 +0200
commitcc879c0344f54c1231d3cf0ee8fa7a224a750d74 (patch)
treecc7f4047efc24e8c8a4a94a8165301e5fa77261d
parent47c8f95f3de11af5221a8527d818a0f9b88346c0 (diff)
parentff84cb1dd72c40e8679c0122c4a8bb3c235f6b16 (diff)
downloadkernel-cc879c0344f54c1231d3cf0ee8fa7a224a750d74.tar.gz
kernel-cc879c0344f54c1231d3cf0ee8fa7a224a750d74.tar.xz
kernel-cc879c0344f54c1231d3cf0ee8fa7a224a750d74.zip
-rw-r--r--0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch141
-rw-r--r--0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch68
-rw-r--r--gitrev2
-rw-r--r--kernel.spec25
-rw-r--r--sources2
5 files changed, 228 insertions, 10 deletions
diff --git a/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch b/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch
new file mode 100644
index 000000000..1511e4a7a
--- /dev/null
+++ b/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch
@@ -0,0 +1,141 @@
+From 7a7662fe09eb2ccd2eb93ce7261aa47c86111b4d Mon Sep 17 00:00:00 2001
+From: Karol Herbst <kherbst@redhat.com>
+Date: Tue, 24 Mar 2020 21:29:23 +0100
+Subject: [PATCH 1/2] drm/nouveau: workaround runpm fail by disabling PCI power
+ management on certain intel bridges
+
+Fixes the infamous 'runtime PM' bug many users are facing on Laptops with
+Nvidia Pascal GPUs by skipping said PCI power state changes on the GPU.
+
+Depending on the used kernel there might be messages like those in dmesg:
+
+"nouveau 0000:01:00.0: Refused to change power state, currently in D3"
+"nouveau 0000:01:00.0: can't change power state from D3cold to D0 (config
+space inaccessible)"
+followed by backtraces of kernel crashes or timeouts within nouveau.
+
+It's still unknown why this issue exists, but this is a reliable workaround
+and solves a very annoying issue for users having to choose between a
+crashing kernel or higher power consumption of their Laptops.
+
+Signed-off-by: Karol Herbst <kherbst@redhat.com>
+Cc: Bjorn Helgaas <bhelgaas@google.com>
+Cc: Lyude Paul <lyude@redhat.com>
+Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
+Cc: Mika Westerberg <mika.westerberg@intel.com>
+Cc: linux-pci@vger.kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: dri-devel@lists.freedesktop.org
+Cc: nouveau@lists.freedesktop.org
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205623
+Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
+---
+ drivers/gpu/drm/nouveau/nouveau_drm.c | 63 +++++++++++++++++++++++++++
+ drivers/gpu/drm/nouveau/nouveau_drv.h | 2 +
+ 2 files changed, 65 insertions(+)
+
+diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
+index 6b1629c14dd7..ca4087f5a15b 100644
+--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
++++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
+@@ -618,6 +618,64 @@ nouveau_drm_device_fini(struct drm_device *dev)
+ kfree(drm);
+ }
+
++/*
++ * On some Intel PCIe bridge controllers doing a
++ * D0 -> D3hot -> D3cold -> D0 sequence causes Nvidia GPUs to not reappear.
++ * Skipping the intermediate D3hot step seems to make it work again. This is
++ * probably caused by not meeting the expectation the involved AML code has
++ * when the GPU is put into D3hot state before invoking it.
++ *
++ * This leads to various manifestations of this issue:
++ * - AML code execution to power on the GPU hits an infinite loop (as the
++ * code waits on device memory to change).
++ * - kernel crashes, as all PCI reads return -1, which most code isn't able
++ * to handle well enough.
++ *
++ * In all cases dmesg will contain at least one line like this:
++ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3'
++ * followed by a lot of nouveau timeouts.
++ *
++ * Deeper down, the \_SB.PCI0.PEG0.PG00._OFF code writes bit 0x80 to the
++ * undocumented PCI config space register 0x248 of the Intel PCIe bridge
++ * controller (0x1901) in order to change the state of the PCIe link between
++ * the PCIe port and the GPU. There are alternative code paths using other
++ * registers, which seem to work fine (executed pre Windows 8):
++ * - 0xbc bit 0x20 (publicly available documentation claims 'reserved')
++ * - 0xb0 bit 0x10 (link disable)
++ * Changing the conditions inside the firmware by poking into the relevant
++ * addresses does resolve the issue, but it seemed to be ACPI private memory
++ * and not any device accessible memory at all, so there is no portable way of
++ * changing the conditions.
++ * On a XPS 9560 that means bits [0,3] on \CPEX need to be cleared.
++ *
++ * The only systems where this behavior can be seen are hybrid graphics laptops
++ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It's unclear whether
++ * this issue only occurs in combination with listed Intel PCIe bridge
++ * controllers and the mentioned GPUs or other devices as well.
++ *
++ * documentation on the PCIe bridge controller can be found in the
++ * "7th Generation Intel® Processor Families for H Platforms Datasheet Volume 2"
++ * Section "12 PCI Express* Controller (x16) Registers"
++ */
++
++static void quirk_broken_nv_runpm(struct pci_dev *pdev)
++{
++ struct drm_device *dev = pci_get_drvdata(pdev);
++ struct nouveau_drm *drm = nouveau_drm(dev);
++ struct pci_dev *bridge = pci_upstream_bridge(pdev);
++
++ if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL)
++ return;
++
++ switch (bridge->device) {
++ case 0x1901:
++ drm->old_pm_cap = pdev->pm_cap;
++ pdev->pm_cap = 0;
++ NV_INFO(drm, "Disabling PCI power management to avoid bug\n");
++ break;
++ }
++}
++
+ static int nouveau_drm_probe(struct pci_dev *pdev,
+ const struct pci_device_id *pent)
+ {
+@@ -699,6 +757,7 @@ static int nouveau_drm_probe(struct pci_dev *pdev,
+ if (ret)
+ goto fail_drm_dev_init;
+
++ quirk_broken_nv_runpm(pdev);
+ return 0;
+
+ fail_drm_dev_init:
+@@ -734,7 +793,11 @@ static void
+ nouveau_drm_remove(struct pci_dev *pdev)
+ {
+ struct drm_device *dev = pci_get_drvdata(pdev);
++ struct nouveau_drm *drm = nouveau_drm(dev);
+
++ /* revert our workaround */
++ if (drm->old_pm_cap)
++ pdev->pm_cap = drm->old_pm_cap;
+ nouveau_drm_device_remove(dev);
+ pci_disable_device(pdev);
+ }
+diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
+index c2c332fbde97..2a6519737800 100644
+--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
++++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
+@@ -140,6 +140,8 @@ struct nouveau_drm {
+
+ struct list_head clients;
+
++ u8 old_pm_cap;
++
+ struct {
+ struct agp_bridge_data *bridge;
+ u32 base;
+--
+2.25.1
+
diff --git a/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch b/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch
new file mode 100644
index 000000000..554800010
--- /dev/null
+++ b/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch
@@ -0,0 +1,68 @@
+From 37b556606d1217b4367e622d88cef11c65764386 Mon Sep 17 00:00:00 2001
+From: Ben Skeggs <bskeggs@redhat.com>
+Date: Tue, 31 Mar 2020 16:08:44 +1000
+Subject: [PATCH 2/2] drm/nouveau/gr/gp107,gp108: implement workaround for HW
+ hanging during init
+
+Certain boards with GP107/GP108 chipsets hang (often, but randomly) for
+unknown reasons during GR initialisation.
+
+The first tell-tale symptom of this issue is:
+
+nouveau 0000:01:00.0: bus: MMIO read of 00000000 FAULT at 409800 [ TIMEOUT ]
+
+appearing in dmesg, likely followed by many other failures being logged.
+
+Karol found this WAR for the issue a while back, but efforts to isolate
+the root cause and proper fix have not yielded success so far. I've
+modified the original patch to include a few more details, limit it to
+GP107/GP108 by default, and added a config option to override this choice.
+
+Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
+Reviewed-by: Karol Herbst <kherbst@redhat.com>
+---
+ .../gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 26 +++++++++++++++++++
+ 1 file changed, 26 insertions(+)
+
+diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
+index dd8f85b8b3a7..f2f5636efac4 100644
+--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
++++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
+@@ -1981,8 +1981,34 @@ gf100_gr_init_(struct nvkm_gr *base)
+ {
+ struct gf100_gr *gr = gf100_gr(base);
+ struct nvkm_subdev *subdev = &base->engine.subdev;
++ struct nvkm_device *device = subdev->device;
++ bool reset = device->chipset == 0x137 || device->chipset == 0x138;
+ u32 ret;
+
++ /* On certain GP107/GP108 boards, we trigger a weird issue where
++ * GR will stop responding to PRI accesses after we've asked the
++ * SEC2 RTOS to boot the GR falcons. This happens with far more
++ * frequency when cold-booting a board (ie. returning from D3).
++ *
++ * The root cause for this is not known and has proven difficult
++ * to isolate, with many avenues being dead-ends.
++ *
++ * A workaround was discovered by Karol, whereby putting GR into
++ * reset for an extended period right before initialisation
++ * prevents the problem from occurring.
++ *
++ * XXX: As RM does not require any such workaround, this is more
++ * of a hack than a true fix.
++ */
++ reset = nvkm_boolopt(device->cfgopt, "NvGrResetWar", reset);
++ if (reset) {
++ nvkm_mask(device, 0x000200, 0x00001000, 0x00000000);
++ nvkm_rd32(device, 0x000200);
++ msleep(50);
++ nvkm_mask(device, 0x000200, 0x00001000, 0x00001000);
++ nvkm_rd32(device, 0x000200);
++ }
++
+ nvkm_pmu_pgob(gr->base.engine.subdev.device->pmu, false);
+
+ ret = nvkm_falcon_get(&gr->fecs.falcon, subdev);
+--
+2.25.1
+
diff --git a/gitrev b/gitrev
index 5559cc881..344940eed 100644
--- a/gitrev
+++ b/gitrev
@@ -1 +1 @@
-a10c9c710f9ecea87b9f4bbb837467893b4bef01
+7e63420847ae5f1036e4f7c42f0b3282e73efbc2
diff --git a/kernel.spec b/kernel.spec
index 851259826..df5a75313 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -115,7 +115,7 @@ Summary: The Linux kernel
# The rc snapshot level
%global rcrev 0
# The git snapshot level
-%define gitrev 6
+%define gitrev 7
# Set rpm version accordingly
%define rpmversion 5.%{upstream_sublevel}.0
%endif
@@ -838,20 +838,17 @@ Patch304: ARM-tegra-usb-no-reset.patch
# Raspberry Pi
# v5 https://patchwork.kernel.org/cover/11429245/
-Patch311: USB-pci-quirks-Add-Raspberry-Pi-4-quirk.patch
+Patch310: USB-pci-quirks-Add-Raspberry-Pi-4-quirk.patch
# Tegra bits
# http://patchwork.ozlabs.org/patch/1243112/
-Patch325: backlight-lp855x-Ensure-regulators-are-disabled-on-probe-failure.patch
+Patch320: backlight-lp855x-Ensure-regulators-are-disabled-on-probe-failure.patch
# https://patchwork.ozlabs.org/patch/1261638/
-Patch326: arm64-drm-tegra-Fix-SMMU-support-on-Tegra124-and-Tegra210.patch
-
-# Coral
+Patch321: arm64-drm-tegra-Fix-SMMU-support-on-Tegra124-and-Tegra210.patch
# Pine64 bits
-# 340-345 queued for 5.7
# https://patchwork.kernel.org/cover/11440399/
-Patch346: Add-support-for-PinePhone-LCD-panel.patch
+Patch330: Add-support-for-PinePhone-LCD-panel.patch
# 400 - IBM (ppc/s390x) patches
@@ -885,6 +882,12 @@ Patch511: 0001-ALSA-hda-realtek-Add-quirk-for-Lenovo-Carbon-X1-8th-.patch
# Fixes build on s390 and should be upstream after rc1
Patch512: export_sysrq_mask.patch
+# nouveau runpm and secboot fixes
+# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/f5755e7069d4acbcce1a93692421f358241ead7b
+Patch513: 0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch
+# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/41c6a13e8143af71928749ea9895d2ebc2fb4ffd
+Patch514: 0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch
+
# END OF PATCH DEFINITIONS
%endif
@@ -2983,6 +2986,12 @@ fi
#
#
%changelog
+* Tue Apr 07 2020 Justin M. Forbes <jforbes@fedoraproject.org> - 5.7.0-0.rc0.git7.1
+- Linux v5.6-11448-g7e63420847ae
+
+* Tue Apr 07 2020 Karol Herbst <kherbst@redhat.com>
+- Add patches to fix nouveau issues preventing booting the installer or system
+
* Mon Apr 06 2020 Justin M. Forbes <jforbes@fedoraproject.org> - 5.7.0-0.rc0.git6.1
- Linux v5.6-11374-ga10c9c710f9e
diff --git a/sources b/sources
index bbc44a295..34cdb48f7 100644
--- a/sources
+++ b/sources
@@ -1,2 +1,2 @@
SHA512 (linux-5.6.tar.xz) = 80846fe2b4e4a7ff471d2dde28a8216ae807a3209f959e93d39ea4fc9a189ea28ec3db9d303b3fe15a28c2cb90e7446876678e93e23353c2d6f262e364a06bc9
-SHA512 (patch-5.6-git6.xz) = a47a364b1c28ce9aba00ef7e4698242489cbc3cfd48778bb23181c589fb611120b039400b291614115d040fa71d9040b292571d2f1f13736831c55b12d25ee47
+SHA512 (patch-5.6-git7.xz) = c35fd3725024f8077a05914e277dd3a05114a198f3fd53ce7dec4e3ab3c9191d3d48baecbd707d9c8cd439662f1043e15bde4cd30dfd5f620953fd0dd82ef6a5