diff options
author | Thorsten Leemhuis <fedora@leemhuis.info> | 2020-04-08 06:50:36 +0200 |
---|---|---|
committer | Thorsten Leemhuis <fedora@leemhuis.info> | 2020-04-08 06:50:36 +0200 |
commit | cc879c0344f54c1231d3cf0ee8fa7a224a750d74 (patch) | |
tree | cc7f4047efc24e8c8a4a94a8165301e5fa77261d | |
parent | 47c8f95f3de11af5221a8527d818a0f9b88346c0 (diff) | |
parent | ff84cb1dd72c40e8679c0122c4a8bb3c235f6b16 (diff) | |
download | kernel-cc879c0344f54c1231d3cf0ee8fa7a224a750d74.tar.gz kernel-cc879c0344f54c1231d3cf0ee8fa7a224a750d74.tar.xz kernel-cc879c0344f54c1231d3cf0ee8fa7a224a750d74.zip |
Merge remote-tracking branch 'origin/master' into rawhide-user-thl-vanilla-fedora kernel-5.7.0-0.rc0.git7.1.vanilla.knurd.1.fc33 kernel-5.7.0-0.rc0.git7.1.vanilla.knurd.1.fc32 kernel-5.7.0-0.rc0.git7.1.vanilla.knurd.1.fc31 kernel-5.7.0-0.rc0.git7.1.vanilla.knurd.1.fc30
-rw-r--r-- | 0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch | 141 | ||||
-rw-r--r-- | 0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch | 68 | ||||
-rw-r--r-- | gitrev | 2 | ||||
-rw-r--r-- | kernel.spec | 25 | ||||
-rw-r--r-- | sources | 2 |
5 files changed, 228 insertions, 10 deletions
diff --git a/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch b/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch new file mode 100644 index 000000000..1511e4a7a --- /dev/null +++ b/0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch @@ -0,0 +1,141 @@ +From 7a7662fe09eb2ccd2eb93ce7261aa47c86111b4d Mon Sep 17 00:00:00 2001 +From: Karol Herbst <kherbst@redhat.com> +Date: Tue, 24 Mar 2020 21:29:23 +0100 +Subject: [PATCH 1/2] drm/nouveau: workaround runpm fail by disabling PCI power + management on certain intel bridges + +Fixes the infamous 'runtime PM' bug many users are facing on Laptops with +Nvidia Pascal GPUs by skipping said PCI power state changes on the GPU. + +Depending on the used kernel there might be messages like those in demsg: + +"nouveau 0000:01:00.0: Refused to change power state, currently in D3" +"nouveau 0000:01:00.0: can't change power state from D3cold to D0 (config +space inaccessible)" +followed by backtraces of kernel crashes or timeouts within nouveau. + +It's still unkown why this issue exists, but this is a reliable workaround +and solves a very annoying issue for user having to choose between a +crashing kernel or higher power consumption of their Laptops. + +Signed-off-by: Karol Herbst <kherbst@redhat.com> +Cc: Bjorn Helgaas <bhelgaas@google.com> +Cc: Lyude Paul <lyude@redhat.com> +Cc: Rafael J. 
Wysocki <rjw@rjwysocki.net> +Cc: Mika Westerberg <mika.westerberg@intel.com> +Cc: linux-pci@vger.kernel.org +Cc: linux-pm@vger.kernel.org +Cc: dri-devel@lists.freedesktop.org +Cc: nouveau@lists.freedesktop.org +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205623 +Signed-off-by: Ben Skeggs <bskeggs@redhat.com> +--- + drivers/gpu/drm/nouveau/nouveau_drm.c | 63 +++++++++++++++++++++++++++ + drivers/gpu/drm/nouveau/nouveau_drv.h | 2 + + 2 files changed, 65 insertions(+) + +diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c +index 6b1629c14dd7..ca4087f5a15b 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_drm.c ++++ b/drivers/gpu/drm/nouveau/nouveau_drm.c +@@ -618,6 +618,64 @@ nouveau_drm_device_fini(struct drm_device *dev) + kfree(drm); + } + ++/* ++ * On some Intel PCIe bridge controllers doing a ++ * D0 -> D3hot -> D3cold -> D0 sequence causes Nvidia GPUs to not reappear. ++ * Skipping the intermediate D3hot step seems to make it work again. This is ++ * probably caused by not meeting the expectation the involved AML code has ++ * when the GPU is put into D3hot state before invoking it. ++ * ++ * This leads to various manifestations of this issue: ++ * - AML code execution to power on the GPU hits an infinite loop (as the ++ * code waits on device memory to change). ++ * - kernel crashes, as all PCI reads return -1, which most code isn't able ++ * to handle well enough. ++ * ++ * In all cases dmesg will contain at least one line like this: ++ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3' ++ * followed by a lot of nouveau timeouts. ++ * ++ * In the \_SB.PCI0.PEG0.PG00._OFF code deeper down writes bit 0x80 to the not ++ * documented PCI config space register 0x248 of the Intel PCIe bridge ++ * controller (0x1901) in order to change the state of the PCIe link between ++ * the PCIe port and the GPU. 
There are alternative code paths using other ++ * registers, which seem to work fine (executed pre Windows 8): ++ * - 0xbc bit 0x20 (publicly available documentation claims 'reserved') ++ * - 0xb0 bit 0x10 (link disable) ++ * Changing the conditions inside the firmware by poking into the relevant ++ * addresses does resolve the issue, but it seemed to be ACPI private memory ++ * and not any device accessible memory at all, so there is no portable way of ++ * changing the conditions. ++ * On a XPS 9560 that means bits [0,3] on \CPEX need to be cleared. ++ * ++ * The only systems where this behavior can be seen are hybrid graphics laptops ++ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It's unclear whether ++ * this issue only occurs in combination with listed Intel PCIe bridge ++ * controllers and the mentioned GPUs or other devices as well. ++ * ++ * documentation on the PCIe bridge controller can be found in the ++ * "7th Generation Intel® Processor Families for H Platforms Datasheet Volume 2" ++ * Section "12 PCI Express* Controller (x16) Registers" ++ */ ++ ++static void quirk_broken_nv_runpm(struct pci_dev *pdev) ++{ ++ struct drm_device *dev = pci_get_drvdata(pdev); ++ struct nouveau_drm *drm = nouveau_drm(dev); ++ struct pci_dev *bridge = pci_upstream_bridge(pdev); ++ ++ if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL) ++ return; ++ ++ switch (bridge->device) { ++ case 0x1901: ++ drm->old_pm_cap = pdev->pm_cap; ++ pdev->pm_cap = 0; ++ NV_INFO(drm, "Disabling PCI power management to avoid bug\n"); ++ break; ++ } ++} ++ + static int nouveau_drm_probe(struct pci_dev *pdev, + const struct pci_device_id *pent) + { +@@ -699,6 +757,7 @@ static int nouveau_drm_probe(struct pci_dev *pdev, + if (ret) + goto fail_drm_dev_init; + ++ quirk_broken_nv_runpm(pdev); + return 0; + + fail_drm_dev_init: +@@ -734,7 +793,11 @@ static void + nouveau_drm_remove(struct pci_dev *pdev) + { + struct drm_device *dev = pci_get_drvdata(pdev); ++ struct nouveau_drm *drm = 
nouveau_drm(dev); + ++ /* revert our workaround */ ++ if (drm->old_pm_cap) ++ pdev->pm_cap = drm->old_pm_cap; + nouveau_drm_device_remove(dev); + pci_disable_device(pdev); + } +diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h +index c2c332fbde97..2a6519737800 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_drv.h ++++ b/drivers/gpu/drm/nouveau/nouveau_drv.h +@@ -140,6 +140,8 @@ struct nouveau_drm { + + struct list_head clients; + ++ u8 old_pm_cap; ++ + struct { + struct agp_bridge_data *bridge; + u32 base; +-- +2.25.1 + diff --git a/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch b/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch new file mode 100644 index 000000000..554800010 --- /dev/null +++ b/0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch @@ -0,0 +1,68 @@ +From 37b556606d1217b4367e622d88cef11c65764386 Mon Sep 17 00:00:00 2001 +From: Ben Skeggs <bskeggs@redhat.com> +Date: Tue, 31 Mar 2020 16:08:44 +1000 +Subject: [PATCH 2/2] drm/nouveau/gr/gp107,gp108: implement workaround for HW + hanging during init + +Certain boards with GP107/GP108 chipsets hang (often, but randomly) for +unknown reasons during GR initialisation. + +The first tell-tale symptom of this issue is: + +nouveau 0000:01:00.0: bus: MMIO read of 00000000 FAULT at 409800 [ TIMEOUT ] + +appearing in dmesg, likely followed by many other failures being logged. + +Karol found this WAR for the issue a while back, but efforts to isolate +the root cause and proper fix have not yielded success so far. I've +modified the original patch to include a few more details, limit it to +GP107/GP108 by default, and added a config option to override this choice. 
+ +Signed-off-by: Ben Skeggs <bskeggs@redhat.com> +Reviewed-by: Karol Herbst <kherbst@redhat.com> +--- + .../gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 26 +++++++++++++++++++ + 1 file changed, 26 insertions(+) + +diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +index dd8f85b8b3a7..f2f5636efac4 100644 +--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c ++++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +@@ -1981,8 +1981,34 @@ gf100_gr_init_(struct nvkm_gr *base) + { + struct gf100_gr *gr = gf100_gr(base); + struct nvkm_subdev *subdev = &base->engine.subdev; ++ struct nvkm_device *device = subdev->device; ++ bool reset = device->chipset == 0x137 || device->chipset == 0x138; + u32 ret; + ++ /* On certain GP107/GP108 boards, we trigger a weird issue where ++ * GR will stop responding to PRI accesses after we've asked the ++ * SEC2 RTOS to boot the GR falcons. This happens with far more ++ * frequency when cold-booting a board (ie. returning from D3). ++ * ++ * The root cause for this is not known and has proven difficult ++ * to isolate, with many avenues being dead-ends. ++ * ++ * A workaround was discovered by Karol, whereby putting GR into ++ * reset for an extended period right before initialisation ++ * prevents the problem from occuring. ++ * ++ * XXX: As RM does not require any such workaround, this is more ++ * of a hack than a true fix. 
++ */ ++ reset = nvkm_boolopt(device->cfgopt, "NvGrResetWar", reset); ++ if (reset) { ++ nvkm_mask(device, 0x000200, 0x00001000, 0x00000000); ++ nvkm_rd32(device, 0x000200); ++ msleep(50); ++ nvkm_mask(device, 0x000200, 0x00001000, 0x00001000); ++ nvkm_rd32(device, 0x000200); ++ } ++ + nvkm_pmu_pgob(gr->base.engine.subdev.device->pmu, false); + + ret = nvkm_falcon_get(&gr->fecs.falcon, subdev); +-- +2.25.1 + @@ -1 +1 @@ -a10c9c710f9ecea87b9f4bbb837467893b4bef01 +7e63420847ae5f1036e4f7c42f0b3282e73efbc2 diff --git a/kernel.spec b/kernel.spec index 851259826..df5a75313 100644 --- a/kernel.spec +++ b/kernel.spec @@ -115,7 +115,7 @@ Summary: The Linux kernel # The rc snapshot level %global rcrev 0 # The git snapshot level -%define gitrev 6 +%define gitrev 7 # Set rpm version accordingly %define rpmversion 5.%{upstream_sublevel}.0 %endif @@ -838,20 +838,17 @@ Patch304: ARM-tegra-usb-no-reset.patch # Raspberry Pi # v5 https://patchwork.kernel.org/cover/11429245/ -Patch311: USB-pci-quirks-Add-Raspberry-Pi-4-quirk.patch +Patch310: USB-pci-quirks-Add-Raspberry-Pi-4-quirk.patch # Tegra bits # http://patchwork.ozlabs.org/patch/1243112/ -Patch325: backlight-lp855x-Ensure-regulators-are-disabled-on-probe-failure.patch +Patch320: backlight-lp855x-Ensure-regulators-are-disabled-on-probe-failure.patch # https://patchwork.ozlabs.org/patch/1261638/ -Patch326: arm64-drm-tegra-Fix-SMMU-support-on-Tegra124-and-Tegra210.patch - -# Coral +Patch321: arm64-drm-tegra-Fix-SMMU-support-on-Tegra124-and-Tegra210.patch # Pine64 bits -# 340-345 queued for 5.7 # https://patchwork.kernel.org/cover/11440399/ -Patch346: Add-support-for-PinePhone-LCD-panel.patch +Patch330: Add-support-for-PinePhone-LCD-panel.patch # 400 - IBM (ppc/s390x) patches @@ -885,6 +882,12 @@ Patch511: 0001-ALSA-hda-realtek-Add-quirk-for-Lenovo-Carbon-X1-8th-.patch # Fixes build on s390 and should be upstream after rc1 Patch512: export_sysrq_mask.patch +# nouveau runpm and secboot fixes +# Accepted nouveau upstream 
https://github.com/skeggsb/nouveau/commit/f5755e7069d4acbcce1a93692421f358241ead7b +Patch513: 0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch +# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/41c6a13e8143af71928749ea9895d2ebc2fb4ffd +Patch514: 0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch + # END OF PATCH DEFINITIONS %endif @@ -2983,6 +2986,12 @@ fi # # %changelog +* Tue Apr 07 2020 Justin M. Forbes <jforbes@fedoraproject.org> - 5.7.0-0.rc0.git7.1 +- Linux v5.6-11448-g7e63420847ae + +* Tue Apr 07 2020 Karol Herbst <kherbst@redhat.com> +- Add patches to fix nouveau issues preventing booting the installer or system + * Mon Apr 06 2020 Justin M. Forbes <jforbes@fedoraproject.org> - 5.7.0-0.rc0.git6.1 - Linux v5.6-11374-ga10c9c710f9e @@ -1,2 +1,2 @@ SHA512 (linux-5.6.tar.xz) = 80846fe2b4e4a7ff471d2dde28a8216ae807a3209f959e93d39ea4fc9a189ea28ec3db9d303b3fe15a28c2cb90e7446876678e93e23353c2d6f262e364a06bc9 -SHA512 (patch-5.6-git6.xz) = a47a364b1c28ce9aba00ef7e4698242489cbc3cfd48778bb23181c589fb611120b039400b291614115d040fa71d9040b292571d2f1f13736831c55b12d25ee47 +SHA512 (patch-5.6-git7.xz) = c35fd3725024f8077a05914e277dd3a05114a198f3fd53ce7dec4e3ab3c9191d3d48baecbd707d9c8cd439662f1043e15bde4cd30dfd5f620953fd0dd82ef6a5 |