diff options
author | Thorsten Leemhuis <fedora@leemhuis.info> | 2020-04-21 15:51:33 +0200 |
---|---|---|
committer | Thorsten Leemhuis <fedora@leemhuis.info> | 2020-04-21 15:51:33 +0200 |
commit | 026b0e49879529dcaf198efcf49ccec280fedfa0 (patch) | |
tree | 7e2788fa485cb8ea57f00d7dd62bc87de023be0f /0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch | |
parent | 8751e34793a6ea7d373d81439c3e3b6a4d227ef3 (diff) | |
parent | 71893eb2f68d8dd59d462be73e9832ccb4ccfe9d (diff) | |
download | kernel-026b0e49879529dcaf198efcf49ccec280fedfa0.tar.gz kernel-026b0e49879529dcaf198efcf49ccec280fedfa0.tar.xz kernel-026b0e49879529dcaf198efcf49ccec280fedfa0.zip |
Merge remote-tracking branch 'origin/f31' into f31-user-thl-vanilla-fedorakernel-5.6.6-250.vanilla.knurd.1.fc31
Diffstat (limited to '0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch')
-rw-r--r-- | 0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch b/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch new file mode 100644 index 000000000..1511e4a7a --- /dev/null +++ b/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch @@ -0,0 +1,141 @@ +From 7a7662fe09eb2ccd2eb93ce7261aa47c86111b4d Mon Sep 17 00:00:00 2001 +From: Karol Herbst <kherbst@redhat.com> +Date: Tue, 24 Mar 2020 21:29:23 +0100 +Subject: [PATCH 1/2] drm/nouveau: workaround runpm fail by disabling PCI power + management on certain intel bridges + +Fixes the infamous 'runtime PM' bug many users are facing on Laptops with +Nvidia Pascal GPUs by skipping said PCI power state changes on the GPU. + +Depending on the used kernel there might be messages like those in demsg: + +"nouveau 0000:01:00.0: Refused to change power state, currently in D3" +"nouveau 0000:01:00.0: can't change power state from D3cold to D0 (config +space inaccessible)" +followed by backtraces of kernel crashes or timeouts within nouveau. + +It's still unkown why this issue exists, but this is a reliable workaround +and solves a very annoying issue for user having to choose between a +crashing kernel or higher power consumption of their Laptops. + +Signed-off-by: Karol Herbst <kherbst@redhat.com> +Cc: Bjorn Helgaas <bhelgaas@google.com> +Cc: Lyude Paul <lyude@redhat.com> +Cc: Rafael J. Wysocki <rjw@rjwysocki.net> +Cc: Mika Westerberg <mika.westerberg@intel.com> +Cc: linux-pci@vger.kernel.org +Cc: linux-pm@vger.kernel.org +Cc: dri-devel@lists.freedesktop.org +Cc: nouveau@lists.freedesktop.org +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205623 +Signed-off-by: Ben Skeggs <bskeggs@redhat.com> +--- + drivers/gpu/drm/nouveau/nouveau_drm.c | 63 +++++++++++++++++++++++++++ + drivers/gpu/drm/nouveau/nouveau_drv.h | 2 + + 2 files changed, 65 insertions(+) + +diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c +index 6b1629c14dd7..ca4087f5a15b 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_drm.c ++++ b/drivers/gpu/drm/nouveau/nouveau_drm.c +@@ -618,6 +618,64 @@ nouveau_drm_device_fini(struct drm_device *dev) + kfree(drm); + } + ++/* ++ * On some Intel PCIe bridge controllers doing a ++ * D0 -> D3hot -> D3cold -> D0 sequence causes Nvidia GPUs to not reappear. ++ * Skipping the intermediate D3hot step seems to make it work again. This is ++ * probably caused by not meeting the expectation the involved AML code has ++ * when the GPU is put into D3hot state before invoking it. ++ * ++ * This leads to various manifestations of this issue: ++ * - AML code execution to power on the GPU hits an infinite loop (as the ++ * code waits on device memory to change). ++ * - kernel crashes, as all PCI reads return -1, which most code isn't able ++ * to handle well enough. ++ * ++ * In all cases dmesg will contain at least one line like this: ++ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3' ++ * followed by a lot of nouveau timeouts. ++ * ++ * In the \_SB.PCI0.PEG0.PG00._OFF code deeper down writes bit 0x80 to the not ++ * documented PCI config space register 0x248 of the Intel PCIe bridge ++ * controller (0x1901) in order to change the state of the PCIe link between ++ * the PCIe port and the GPU. There are alternative code paths using other ++ * registers, which seem to work fine (executed pre Windows 8): ++ * - 0xbc bit 0x20 (publicly available documentation claims 'reserved') ++ * - 0xb0 bit 0x10 (link disable) ++ * Changing the conditions inside the firmware by poking into the relevant ++ * addresses does resolve the issue, but it seemed to be ACPI private memory ++ * and not any device accessible memory at all, so there is no portable way of ++ * changing the conditions. ++ * On a XPS 9560 that means bits [0,3] on \CPEX need to be cleared. ++ * ++ * The only systems where this behavior can be seen are hybrid graphics laptops ++ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It's unclear whether ++ * this issue only occurs in combination with listed Intel PCIe bridge ++ * controllers and the mentioned GPUs or other devices as well. ++ * ++ * documentation on the PCIe bridge controller can be found in the ++ * "7th Generation Intel® Processor Families for H Platforms Datasheet Volume 2" ++ * Section "12 PCI Express* Controller (x16) Registers" ++ */ ++ ++static void quirk_broken_nv_runpm(struct pci_dev *pdev) ++{ ++ struct drm_device *dev = pci_get_drvdata(pdev); ++ struct nouveau_drm *drm = nouveau_drm(dev); ++ struct pci_dev *bridge = pci_upstream_bridge(pdev); ++ ++ if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL) ++ return; ++ ++ switch (bridge->device) { ++ case 0x1901: ++ drm->old_pm_cap = pdev->pm_cap; ++ pdev->pm_cap = 0; ++ NV_INFO(drm, "Disabling PCI power management to avoid bug\n"); ++ break; ++ } ++} ++ + static int nouveau_drm_probe(struct pci_dev *pdev, + const struct pci_device_id *pent) + { +@@ -699,6 +757,7 @@ static int nouveau_drm_probe(struct pci_dev *pdev, + if (ret) + goto fail_drm_dev_init; + ++ quirk_broken_nv_runpm(pdev); + return 0; + + fail_drm_dev_init: +@@ -734,7 +793,11 @@ static void + nouveau_drm_remove(struct pci_dev *pdev) + { + struct drm_device *dev = pci_get_drvdata(pdev); ++ struct nouveau_drm *drm = nouveau_drm(dev); + ++ /* revert our workaround */ ++ if (drm->old_pm_cap) ++ pdev->pm_cap = drm->old_pm_cap; + nouveau_drm_device_remove(dev); + pci_disable_device(pdev); + } +diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h +index c2c332fbde97..2a6519737800 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_drv.h ++++ b/drivers/gpu/drm/nouveau/nouveau_drv.h +@@ -140,6 +140,8 @@ struct nouveau_drm { + + struct list_head clients; + ++ u8 old_pm_cap; ++ + struct { + struct agp_bridge_data *bridge; + u32 base; +-- +2.25.1 + |