1
0
mirror of https://git.yoctoproject.org/meta-ti synced 2026-06-06 10:50:37 +00:00

linux-ti33x-psp 3.2: update to 3.2.25

Signed-off-by: Koen Kooi <koen@dominion.thruhere.net>
Signed-off-by: Denys Dmytriyenko <denys@ti.com>
This commit is contained in:
Koen Kooi
2012-08-08 10:41:23 +02:00
committed by Denys Dmytriyenko
parent e4ea426bb5
commit 6203cbca5c
184 changed files with 16831 additions and 3 deletions
+1 -1
View File
@@ -4,7 +4,7 @@ require conf/machine/include/soc-family.inc
require conf/machine/include/tune-cortexa8.inc
PREFERRED_PROVIDER_virtual/kernel = "linux-ti33x-psp"
# Increase this everytime you change something in the kernel
MACHINE_KERNEL_PR = "r14"
MACHINE_KERNEL_PR = "r15"
KERNEL_IMAGETYPE = "uImage"
@@ -0,0 +1,306 @@
From fe36d7279bb09c09b2c07b1b8bfe786a3ab12486 Mon Sep 17 00:00:00 2001
From: Corentin Chary <corentincj@iksaif.net>
Date: Sat, 26 Nov 2011 11:00:10 +0100
Subject: [PATCH 001/109] samsung-laptop: make the dmi check less strict
commit 3be324a94df0c3f032178d04549dbfbf6cccb09a upstream.
This enable the driver for everything that look like
a laptop and is from vendor "SAMSUNG ELECTRONICS CO., LTD.".
Note that laptop supported by samsung-q10 seem to have a different
vendor strict.
Also remove every log output until we know that we have a SABI interface
(except if the driver is forced to load, or debug is enabled).
Keeping a whitelist of laptop with a model granularity is something that can't
work without close vendor cooperation (and we don't have that).
Signed-off-by: Corentin Chary <corentincj@iksaif.net>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
[bwh: Backported to 3.2:
- Adjust context
- Drop changes relating to ACPI video]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/platform/x86/samsung-laptop.c | 225 ++-------------------------------
1 files changed, 8 insertions(+), 217 deletions(-)
diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c
index 09e26bf..af1e296 100644
--- a/drivers/platform/x86/samsung-laptop.c
+++ b/drivers/platform/x86/samsung-laptop.c
@@ -540,245 +540,34 @@ static DEVICE_ATTR(performance_level, S_IWUSR | S_IRUGO,
get_performance_level, set_performance_level);
-static int __init dmi_check_cb(const struct dmi_system_id *id)
-{
- pr_info("found laptop model '%s'\n",
- id->ident);
- return 1;
-}
-
static struct dmi_system_id __initdata samsung_dmi_table[] = {
{
- .ident = "N128",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "N128"),
- DMI_MATCH(DMI_BOARD_NAME, "N128"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "N130",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR,
"SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "N130"),
- DMI_MATCH(DMI_BOARD_NAME, "N130"),
+ DMI_MATCH(DMI_CHASSIS_TYPE, "8"), /* Portable */
},
- .callback = dmi_check_cb,
},
{
- .ident = "N510",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR,
"SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "N510"),
- DMI_MATCH(DMI_BOARD_NAME, "N510"),
+ DMI_MATCH(DMI_CHASSIS_TYPE, "9"), /* Laptop */
},
- .callback = dmi_check_cb,
},
{
- .ident = "X125",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR,
"SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "X125"),
- DMI_MATCH(DMI_BOARD_NAME, "X125"),
+ DMI_MATCH(DMI_CHASSIS_TYPE, "10"), /* Notebook */
},
- .callback = dmi_check_cb,
},
{
- .ident = "X120/X170",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR,
"SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "X120/X170"),
- DMI_MATCH(DMI_BOARD_NAME, "X120/X170"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "NC10",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "NC10"),
- DMI_MATCH(DMI_BOARD_NAME, "NC10"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "NP-Q45",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "SQ45S70S"),
- DMI_MATCH(DMI_BOARD_NAME, "SQ45S70S"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "X360",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "X360"),
- DMI_MATCH(DMI_BOARD_NAME, "X360"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "R410 Plus",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "R410P"),
- DMI_MATCH(DMI_BOARD_NAME, "R460"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "R518",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "R518"),
- DMI_MATCH(DMI_BOARD_NAME, "R518"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "R519/R719",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "R519/R719"),
- DMI_MATCH(DMI_BOARD_NAME, "R519/R719"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "N150/N210/N220",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "N150/N210/N220"),
- DMI_MATCH(DMI_BOARD_NAME, "N150/N210/N220"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "N220",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "N220"),
- DMI_MATCH(DMI_BOARD_NAME, "N220"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "N150/N210/N220/N230",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "N150/N210/N220/N230"),
- DMI_MATCH(DMI_BOARD_NAME, "N150/N210/N220/N230"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "N150P/N210P/N220P",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "N150P/N210P/N220P"),
- DMI_MATCH(DMI_BOARD_NAME, "N150P/N210P/N220P"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "R700",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "SR700"),
- DMI_MATCH(DMI_BOARD_NAME, "SR700"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "R530/R730",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "R530/R730"),
- DMI_MATCH(DMI_BOARD_NAME, "R530/R730"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "NF110/NF210/NF310",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "NF110/NF210/NF310"),
- DMI_MATCH(DMI_BOARD_NAME, "NF110/NF210/NF310"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "N145P/N250P/N260P",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "N145P/N250P/N260P"),
- DMI_MATCH(DMI_BOARD_NAME, "N145P/N250P/N260P"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "R70/R71",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR,
- "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "R70/R71"),
- DMI_MATCH(DMI_BOARD_NAME, "R70/R71"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "P460",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "P460"),
- DMI_MATCH(DMI_BOARD_NAME, "P460"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "R528/R728",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "R528/R728"),
- DMI_MATCH(DMI_BOARD_NAME, "R528/R728"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "NC210/NC110",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "NC210/NC110"),
- DMI_MATCH(DMI_BOARD_NAME, "NC210/NC110"),
- },
- .callback = dmi_check_cb,
- },
- {
- .ident = "X520",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
- DMI_MATCH(DMI_PRODUCT_NAME, "X520"),
- DMI_MATCH(DMI_BOARD_NAME, "X520"),
+ DMI_MATCH(DMI_CHASSIS_TYPE, "14"), /* Sub-Notebook */
},
- .callback = dmi_check_cb,
},
{ },
};
@@ -819,7 +608,8 @@ static int __init samsung_init(void)
f0000_segment = ioremap_nocache(0xf0000, 0xffff);
if (!f0000_segment) {
- pr_err("Can't map the segment at 0xf0000\n");
+ if (debug || force)
+ pr_err("Can't map the segment at 0xf0000\n");
return -EINVAL;
}
@@ -832,7 +622,8 @@ static int __init samsung_init(void)
}
if (loca == 0xffff) {
- pr_err("This computer does not support SABI\n");
+ if (debug || force)
+ pr_err("This computer does not support SABI\n");
goto error_no_signature;
}
--
1.7.7.6
@@ -0,0 +1,43 @@
From c3041d04e675a5d38a1d57fee3c59d4f073f512e Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Tue, 3 Jul 2012 15:57:19 +1000
Subject: [PATCH 002/109] raid5: delayed stripe fix
commit fab363b5ff502d1b39ddcfec04271f5858d9f26e upstream.
There isn't locking setting STRIPE_DELAYED and STRIPE_PREREAD_ACTIVE bits, but
the two bits have relationship. A delayed stripe can be moved to hold list only
when preread active stripe count is below IO_THRESHOLD. If a stripe has both
the bits set, such stripe will be in delayed list and preread count not 0,
which will make such stripe never leave delayed list.
Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/md/raid5.c | 4 +++-
1 files changed, 3 insertions(+), 1 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6ba4954..26ef63a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -196,12 +196,14 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
- if (test_bit(STRIPE_DELAYED, &sh->state))
+ if (test_bit(STRIPE_DELAYED, &sh->state) &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
+ clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
}
--
1.7.7.6
@@ -0,0 +1,36 @@
From ebf148a0ca2e0c9fb824a069c0fd5311bb6ae297 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 2 Dec 2011 23:41:42 +0000
Subject: [PATCH 003/109] tcp: drop SYN+FIN messages
commit fdf5af0daf8019cec2396cdef8fb042d80fe71fa upstream.
Denys Fedoryshchenko reported that SYN+FIN attacks were bringing his
linux machines to their limits.
Dont call conn_request() if the TCP flags includes SYN flag
Reported-by: Denys Fedoryshchenko <denys@visp.net.lb>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
net/ipv4/tcp_input.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9726927..32e6ca2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5836,6 +5836,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
if (th->syn) {
+ if (th->fin)
+ goto discard;
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
--
1.7.7.6
@@ -0,0 +1,35 @@
From 389c56a2dfc90eecb97841668a8d61fc3424f2c8 Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Thu, 7 Jun 2012 12:56:54 +0000
Subject: [PATCH 004/109] tg3: Apply short DMA frag workaround to 5906
commit b7abee6ef888117f92db370620ebf116a38e3f4d upstream.
5906 devices also need the short DMA fragment workaround. This patch
makes the necessary change.
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Tested-by: Christian Kujau <lists@nerdbynature.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/ethernet/broadcom/tg3.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 2dcac28..6b258d9 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14046,7 +14046,8 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
}
}
- if (tg3_flag(tp, 5755_PLUS))
+ if (tg3_flag(tp, 5755_PLUS) ||
+ GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906)
tg3_flag_set(tp, SHORT_DMA_BUG);
if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719)
--
1.7.7.6
@@ -0,0 +1,56 @@
From c8ad2074ce769ad8b16677e0a9bee9232be03acc Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 16 May 2012 11:06:21 +0200
Subject: [PATCH 005/109] rtl8187: ->brightness_set can not sleep
commit 0fde0a8cfd0ede7f310d6a681c8e5a7cb3e32406 upstream.
Fix:
BUG: sleeping function called from invalid context at kernel/workqueue.c:2547
in_atomic(): 1, irqs_disabled(): 0, pid: 629, name: wpa_supplicant
2 locks held by wpa_supplicant/629:
#0: (rtnl_mutex){+.+.+.}, at: [<c08b2b84>] rtnl_lock+0x14/0x20
#1: (&trigger->leddev_list_lock){.+.?..}, at: [<c0867f41>] led_trigger_event+0x21/0x80
Pid: 629, comm: wpa_supplicant Not tainted 3.3.0-0.rc3.git5.1.fc17.i686
Call Trace:
[<c046a9f6>] __might_sleep+0x126/0x1d0
[<c0457d6c>] wait_on_work+0x2c/0x1d0
[<c045a09a>] __cancel_work_timer+0x6a/0x120
[<c045a160>] cancel_delayed_work_sync+0x10/0x20
[<f7dd3c22>] rtl8187_led_brightness_set+0x82/0xf0 [rtl8187]
[<c0867f7c>] led_trigger_event+0x5c/0x80
[<f7ff5e6d>] ieee80211_led_radio+0x1d/0x40 [mac80211]
[<f7ff3583>] ieee80211_stop_device+0x13/0x230 [mac80211]
Removing _sync is ok, because if led_on work is currently running
it will be finished before led_off work start to perform, since
they are always queued on the same mac80211 local->workqueue.
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=795176
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Larry Finger <Larry.Finger@lwfinger.net>
Acked-by: Hin-Tak Leung <htl10@users.sourceforge.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/wireless/rtl818x/rtl8187/leds.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/wireless/rtl818x/rtl8187/leds.c b/drivers/net/wireless/rtl818x/rtl8187/leds.c
index 2e0de2f..c2d5b49 100644
--- a/drivers/net/wireless/rtl818x/rtl8187/leds.c
+++ b/drivers/net/wireless/rtl818x/rtl8187/leds.c
@@ -117,7 +117,7 @@ static void rtl8187_led_brightness_set(struct led_classdev *led_dev,
radio_on = true;
} else if (radio_on) {
radio_on = false;
- cancel_delayed_work_sync(&priv->led_on);
+ cancel_delayed_work(&priv->led_on);
ieee80211_queue_delayed_work(hw, &priv->led_off, 0);
}
} else if (radio_on) {
--
1.7.7.6
@@ -0,0 +1,102 @@
From 54490f32c07630f7c6bd6429bf73d7507b06e3db Mon Sep 17 00:00:00 2001
From: Stanislav Yakovlev <stas.yakovlev@gmail.com>
Date: Tue, 10 Apr 2012 21:44:47 -0400
Subject: [PATCH 006/109] net/wireless: ipw2x00: add supported cipher suites
to wiphy initialization
commit a141e6a0097118bb35024485f1faffc0d9042f5c upstream.
Driver doesn't report its supported cipher suites through cfg80211
interface. It still uses wext interface and probably will not work
through nl80211, but will at least correctly advertise supported
features.
Bug was reported by Omar Siam.
https://bugzilla.kernel.org/show_bug.cgi?id=43049
Signed-off-by: Stanislav Yakovlev <stas.yakovlev@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/wireless/ipw2x00/ipw.h | 23 +++++++++++++++++++++++
drivers/net/wireless/ipw2x00/ipw2100.c | 4 ++++
drivers/net/wireless/ipw2x00/ipw2200.c | 4 ++++
3 files changed, 31 insertions(+), 0 deletions(-)
create mode 100644 drivers/net/wireless/ipw2x00/ipw.h
diff --git a/drivers/net/wireless/ipw2x00/ipw.h b/drivers/net/wireless/ipw2x00/ipw.h
new file mode 100644
index 0000000..4007bf5
--- /dev/null
+++ b/drivers/net/wireless/ipw2x00/ipw.h
@@ -0,0 +1,23 @@
+/*
+ * Intel Pro/Wireless 2100, 2200BG, 2915ABG network connection driver
+ *
+ * Copyright 2012 Stanislav Yakovlev <stas.yakovlev@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __IPW_H__
+#define __IPW_H__
+
+#include <linux/ieee80211.h>
+
+static const u32 ipw_cipher_suites[] = {
+ WLAN_CIPHER_SUITE_WEP40,
+ WLAN_CIPHER_SUITE_WEP104,
+ WLAN_CIPHER_SUITE_TKIP,
+ WLAN_CIPHER_SUITE_CCMP,
+};
+
+#endif
diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
index 127e9c6..10862d4 100644
--- a/drivers/net/wireless/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/ipw2x00/ipw2100.c
@@ -166,6 +166,7 @@ that only one external action is invoked at a time.
#include <net/lib80211.h>
#include "ipw2100.h"
+#include "ipw.h"
#define IPW2100_VERSION "git-1.2.2"
@@ -1955,6 +1956,9 @@ static int ipw2100_wdev_init(struct net_device *dev)
wdev->wiphy->bands[IEEE80211_BAND_2GHZ] = bg_band;
}
+ wdev->wiphy->cipher_suites = ipw_cipher_suites;
+ wdev->wiphy->n_cipher_suites = ARRAY_SIZE(ipw_cipher_suites);
+
set_wiphy_dev(wdev->wiphy, &priv->pci_dev->dev);
if (wiphy_register(wdev->wiphy)) {
ipw2100_down(priv);
diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index 827889b..56bd370 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -34,6 +34,7 @@
#include <linux/slab.h>
#include <net/cfg80211-wext.h>
#include "ipw2200.h"
+#include "ipw.h"
#ifndef KBUILD_EXTMOD
@@ -11535,6 +11536,9 @@ static int ipw_wdev_init(struct net_device *dev)
wdev->wiphy->bands[IEEE80211_BAND_5GHZ] = a_band;
}
+ wdev->wiphy->cipher_suites = ipw_cipher_suites;
+ wdev->wiphy->n_cipher_suites = ARRAY_SIZE(ipw_cipher_suites);
+
set_wiphy_dev(wdev->wiphy, &priv->pci_dev->dev);
/* With that information in place, we can now register the wiphy... */
--
1.7.7.6
@@ -0,0 +1,47 @@
From ae8e28c4c0aaba535e88908a7a2c560bb55061f9 Mon Sep 17 00:00:00 2001
From: Eugeni Dodonov <eugeni.dodonov@intel.com>
Date: Tue, 14 Feb 2012 11:44:48 -0200
Subject: [PATCH 007/109] drm/i915: do not enable RC6p on Sandy Bridge
commit 1c8ecf80fdee4e7b23a9e7da7ff9bd59ba2dcf96 upstream.
With base on latest findings, RC6p seems to be respondible for RC6-related
issues on Sandy Bridge platform. To work-around those issues, the previous
solution was to completely disable RC6 on Sandy Bridge for the past few
releases, even if plain RC6 was not giving any issues.
What this patch does is preventing RC6p from being enabled on Sandy Bridge
even if users enable RC6 via a kernel parameter. So it won't change the
defaults in any way, but will ensure that if users do enable RC6 manually
it won't break their machines by enabling this extra state.
Proper fix for this (enabling specific RC6 states according to the GPU
generation) were proposed for the -next kernel, but we are too late in the
release process now to pick such changes.
Acked-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Eugeni Dodonov <eugeni.dodonov@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/gpu/drm/i915/intel_display.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 6aa7716..c63ca5f 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -8043,8 +8043,8 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv)
I915_WRITE(GEN6_RC6pp_THRESHOLD, 64000); /* unused */
if (intel_enable_rc6(dev_priv->dev))
- rc6_mask = GEN6_RC_CTL_RC6p_ENABLE |
- GEN6_RC_CTL_RC6_ENABLE;
+ rc6_mask = GEN6_RC_CTL_RC6_ENABLE |
+ (IS_GEN7(dev_priv->dev)) ? GEN6_RC_CTL_RC6p_ENABLE : 0;
I915_WRITE(GEN6_RC_CONTROL,
rc6_mask |
--
1.7.7.6
@@ -0,0 +1,34 @@
From 2c5df93ac56c5ec76f87a0daf418966abb25b03b Mon Sep 17 00:00:00 2001
From: Eugeni Dodonov <eugeni.dodonov@intel.com>
Date: Thu, 23 Feb 2012 23:57:06 -0200
Subject: [PATCH 008/109] drm/i915: fix operator precedence when enabling RC6p
commit c0e2ee1bc0cf82eec89e26b7afe7e4db0561b7d9 upstream.
As noticed by Torsten Kaiser, the operator precedence can play tricks with
us here.
CC: Dave Airlie <airlied@redhat.com>
Signed-off-by: Eugeni Dodonov <eugeni.dodonov@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/gpu/drm/i915/intel_display.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index c63ca5f..cc75c4b 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -8044,7 +8044,7 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv)
if (intel_enable_rc6(dev_priv->dev))
rc6_mask = GEN6_RC_CTL_RC6_ENABLE |
- (IS_GEN7(dev_priv->dev)) ? GEN6_RC_CTL_RC6p_ENABLE : 0;
+ ((IS_GEN7(dev_priv->dev)) ? GEN6_RC_CTL_RC6p_ENABLE : 0);
I915_WRITE(GEN6_RC_CONTROL,
rc6_mask |
--
1.7.7.6
@@ -0,0 +1,47 @@
From 6ad602fb1ad21f96e203b4525aa56c7e0cc6ac4f Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Date: Tue, 17 Jan 2012 14:50:51 -0200
Subject: [PATCH 009/109] kbuild: do not check for ancient modutils tools
commit 620c231c7a7f48745094727bb612f6321cfc8844 upstream.
scripts/depmod.sh checks for the output of '-V' expecting that it has
module-init-tools in it. It's a hack to prevent users from using
modutils instead of module-init-tools, that only works with 2.4.x
kernels. This however prints an annoying warning for kmod tool, that is
currently replacing module-init-tools.
Rather than putting another check for kmod's version, just remove it
since users of 2.4.x kernel are unlikely to upgrade to 3.x, and if they
do, let depmod fail in that case because they should know what they are
doing.
Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Acked-by: WANG Cong <amwang@redhat.com>
Acked-By: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Michal Marek <mmarek@suse.cz>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
scripts/depmod.sh | 6 ------
1 files changed, 0 insertions(+), 6 deletions(-)
diff --git a/scripts/depmod.sh b/scripts/depmod.sh
index a272356..2ae4817 100755
--- a/scripts/depmod.sh
+++ b/scripts/depmod.sh
@@ -9,12 +9,6 @@ fi
DEPMOD=$1
KERNELRELEASE=$2
-if ! "$DEPMOD" -V 2>/dev/null | grep -q module-init-tools; then
- echo "Warning: you may need to install module-init-tools" >&2
- echo "See http://www.codemonkey.org.uk/docs/post-halloween-2.6.txt" >&2
- sleep 1
-fi
-
if ! test -r System.map -a -x "$DEPMOD"; then
exit 0
fi
--
1.7.7.6
@@ -0,0 +1,45 @@
From bc3f81b80966fcd6e91b61c76408eed675a1b364 Mon Sep 17 00:00:00 2001
From: Eldad Zack <eldad@fogrefinery.com>
Date: Sun, 22 Apr 2012 00:48:04 +0200
Subject: [PATCH 010/109] brcmsmac: "INTERMEDIATE but not AMPDU" only when
tracing
commit 6ead629b27269c553c9092c47cd8f5ab0309ee3b upstream.
I keep getting the following messages on the log buffer:
[ 2167.097507] ieee80211 phy0: brcms_c_dotxstatus: INTERMEDIATE but not AMPDU
[ 2281.331305] ieee80211 phy0: brcms_c_dotxstatus: INTERMEDIATE but not AMPDU
[ 2281.332539] ieee80211 phy0: brcms_c_dotxstatus: INTERMEDIATE but not AMPDU
[ 2329.876605] ieee80211 phy0: brcms_c_dotxstatus: INTERMEDIATE but not AMPDU
[ 2329.877354] ieee80211 phy0: brcms_c_dotxstatus: INTERMEDIATE but not AMPDU
[ 2462.280756] ieee80211 phy0: brcms_c_dotxstatus: INTERMEDIATE but not AMPDU
[ 2615.651689] ieee80211 phy0: brcms_c_dotxstatus: INTERMEDIATE but not AMPDU
From the code comment I understand that this something that can -
and does, quite frequently - happen.
Signed-off-by: Eldad Zack <eldad@fogrefinery.com>
Acked-by: Franky Lin<frankyl@broadcom.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/wireless/brcm80211/brcmsmac/main.c | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)
diff --git a/drivers/net/wireless/brcm80211/brcmsmac/main.c b/drivers/net/wireless/brcm80211/brcmsmac/main.c
index 833cbef..8a40ff9 100644
--- a/drivers/net/wireless/brcm80211/brcmsmac/main.c
+++ b/drivers/net/wireless/brcm80211/brcmsmac/main.c
@@ -900,8 +900,7 @@ brcms_c_dotxstatus(struct brcms_c_info *wlc, struct tx_status *txs)
*/
if (!(txs->status & TX_STATUS_AMPDU)
&& (txs->status & TX_STATUS_INTERMEDIATE)) {
- wiphy_err(wlc->wiphy, "%s: INTERMEDIATE but not AMPDU\n",
- __func__);
+ BCMMSG(wlc->wiphy, "INTERMEDIATE but not AMPDU\n");
return false;
}
--
1.7.7.6
@@ -0,0 +1,37 @@
From 9c6259f9486461da338a097ae1655d55b7bead3a Mon Sep 17 00:00:00 2001
From: William Dauchy <wdauchy@gmail.com>
Date: Wed, 14 Mar 2012 12:32:04 +0100
Subject: [PATCH 011/109] NFSv4: Rate limit the state manager for lock reclaim
warning messages
commit 96dcadc2fdd111dca90d559f189a30c65394451a upstream.
Adding rate limit on `Lock reclaim failed` messages since it could fill
up system logs
Signed-off-by: William Dauchy <wdauchy@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
[bwh: Backported to 3.2: add the 'NFS:' prefix at the same time]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/nfs/nfs4state.c | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 66020ac..07354b7 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1186,8 +1186,9 @@ restart:
spin_lock(&state->state_lock);
list_for_each_entry(lock, &state->lock_states, ls_locks) {
if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
- printk("%s: Lock reclaim failed!\n",
- __func__);
+ pr_warn_ratelimited("NFS: "
+ "%s: Lock reclaim "
+ "failed!\n", __func__);
}
spin_unlock(&state->state_lock);
nfs4_put_open_state(state);
--
1.7.7.6
@@ -0,0 +1,32 @@
From 877ee75ef7f45fd1022c37f6a8a957e9d1b098b7 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Wed, 4 Jan 2012 21:22:51 -0500
Subject: [PATCH 012/109] ext4: Report max_batch_time option correctly
commit 1d526fc91bea04ee35b7599bf8b82f86c0aaf46c upstream.
Currently the value reported for max_batch_time is really the
value of min_batch_time.
Reported-by: Russell Coker <russell@coker.com.au>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/ext4/super.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ab7aa3f..a93486e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1097,7 +1097,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
}
if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
seq_printf(seq, ",max_batch_time=%u",
- (unsigned) sbi->s_min_batch_time);
+ (unsigned) sbi->s_max_batch_time);
}
/*
--
1.7.7.6
@@ -0,0 +1,464 @@
From 5babdc7487f6c78c06d8e085efe841d91a77ff48 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 21 Mar 2012 16:34:12 -0700
Subject: [PATCH 013/109] hugepages: fix use after free bug in "quota"
handling
commit 90481622d75715bfcb68501280a917dbfe516029 upstream.
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[bwh: Backported to 3.2: adjust context to apply after commit
c50ac050811d6485616a193eb0f37bfbd191cc89 'hugetlb: fix resv_map leak in
error path', backported in 3.2.20]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/hugetlbfs/inode.c | 54 +++++++-----------
include/linux/hugetlb.h | 14 ++++--
mm/hugetlb.c | 135 +++++++++++++++++++++++++++++++++++++---------
3 files changed, 139 insertions(+), 64 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2d0ca24..ebc2f4d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -592,9 +592,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
spin_lock(&sbinfo->stat_lock);
/* If no limits set, just report 0 for max/free/used
* blocks, like simple_statfs() */
- if (sbinfo->max_blocks >= 0) {
- buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ if (sbinfo->spool) {
+ long free_pages;
+
+ spin_lock(&sbinfo->spool->lock);
+ buf->f_blocks = sbinfo->spool->max_hpages;
+ free_pages = sbinfo->spool->max_hpages
+ - sbinfo->spool->used_hpages;
+ buf->f_bavail = buf->f_bfree = free_pages;
+ spin_unlock(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
@@ -610,6 +616,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
if (sbi) {
sb->s_fs_info = NULL;
+
+ if (sbi->spool)
+ hugepage_put_subpool(sbi->spool);
+
kfree(sbi);
}
}
@@ -841,10 +851,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = sbinfo;
sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = config.nr_blocks;
- sbinfo->free_blocks = config.nr_blocks;
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
+ sbinfo->spool = NULL;
+ if (config.nr_blocks != -1) {
+ sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+ if (!sbinfo->spool)
+ goto out_free;
+ }
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = huge_page_size(config.hstate);
sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -864,38 +878,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_root = root;
return 0;
out_free:
+ if (sbinfo->spool)
+ kfree(sbinfo->spool);
kfree(sbinfo);
return -ENOMEM;
}
-int hugetlb_get_quota(struct address_space *mapping, long delta)
-{
- int ret = 0;
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
- if (sbinfo->free_blocks > -1) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks - delta >= 0)
- sbinfo->free_blocks -= delta;
- else
- ret = -ENOMEM;
- spin_unlock(&sbinfo->stat_lock);
- }
-
- return ret;
-}
-
-void hugetlb_put_quota(struct address_space *mapping, long delta)
-{
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
- if (sbinfo->free_blocks > -1) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks += delta;
- spin_unlock(&sbinfo->stat_lock);
- }
-}
-
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d9d6c86..c5ed2f1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -14,6 +14,15 @@ struct user_struct;
#include <linux/shm.h>
#include <asm/tlbflush.h>
+struct hugepage_subpool {
+ spinlock_t lock;
+ long count;
+ long max_hpages, used_hpages;
+};
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+void hugepage_put_subpool(struct hugepage_subpool *spool);
+
int PageHuge(struct page *page);
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
@@ -138,12 +147,11 @@ struct hugetlbfs_config {
};
struct hugetlbfs_sb_info {
- long max_blocks; /* blocks allowed */
- long free_blocks; /* blocks free */
long max_inodes; /* inodes allowed */
long free_inodes; /* inodes free */
spinlock_t stat_lock;
struct hstate *hstate;
+ struct hugepage_subpool *spool;
};
@@ -166,8 +174,6 @@ extern const struct file_operations hugetlbfs_file_operations;
extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
static inline int is_file_hugepages(struct file *file)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f5c545..7c535b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
*/
static DEFINE_SPINLOCK(hugetlb_lock);
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+ bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+ spin_unlock(&spool->lock);
+
+ /* If no pages are used, and no other handles to the subpool
+ * remain, free the subpool the subpool remain */
+ if (free)
+ kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+ struct hugepage_subpool *spool;
+
+ spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ if (!spool)
+ return NULL;
+
+ spin_lock_init(&spool->lock);
+ spool->count = 1;
+ spool->max_hpages = nr_blocks;
+ spool->used_hpages = 0;
+
+ return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+ spin_lock(&spool->lock);
+ BUG_ON(!spool->count);
+ spool->count--;
+ unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ int ret = 0;
+
+ if (!spool)
+ return 0;
+
+ spin_lock(&spool->lock);
+ if ((spool->used_hpages + delta) <= spool->max_hpages) {
+ spool->used_hpages += delta;
+ } else {
+ ret = -ENOMEM;
+ }
+ spin_unlock(&spool->lock);
+
+ return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ if (!spool)
+ return;
+
+ spin_lock(&spool->lock);
+ spool->used_hpages -= delta;
+ /* If hugetlbfs_put_super couldn't free spool due to
+ * an outstanding quota reference, free it now. */
+ unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+ return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+ return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
/*
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
@@ -533,9 +611,9 @@ static void free_huge_page(struct page *page)
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
- struct address_space *mapping;
+ struct hugepage_subpool *spool =
+ (struct hugepage_subpool *)page_private(page);
- mapping = (struct address_space *) page_private(page);
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
@@ -551,8 +629,7 @@ static void free_huge_page(struct page *page)
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- if (mapping)
- hugetlb_put_quota(mapping, 1);
+ hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -966,11 +1043,12 @@ static void return_unused_surplus_pages(struct hstate *h,
/*
* Determine if the huge page at addr within the vma has an associated
* reservation. Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed. Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
*/
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1097,24 @@ static void vma_commit_reservation(struct hstate *h,
static struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
+ struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
long chg;
/*
- * Processes that did not create the mapping will have no reserves and
- * will not have accounted against quota. Check that the quota can be
- * made before satisfying the allocation
- * MAP_NORESERVE mappings may also need pages and quota allocated
- * if no reserve mapping overlaps.
+ * Processes that did not create the mapping will have no
+ * reserves and will not have accounted against subpool
+ * limit. Check that the subpool limit can be made before
+ * satisfying the allocation MAP_NORESERVE mappings may also
+ * need pages and subpool limit allocated allocated if no reserve
+ * mapping overlaps.
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
return ERR_PTR(-VM_FAULT_OOM);
if (chg)
- if (hugetlb_get_quota(inode->i_mapping, chg))
+ if (hugepage_subpool_get_pages(spool, chg))
return ERR_PTR(-VM_FAULT_SIGBUS);
spin_lock(&hugetlb_lock);
@@ -1046,12 +1124,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
if (!page) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
- hugetlb_put_quota(inode->i_mapping, chg);
+ hugepage_subpool_put_pages(spool, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
}
}
- set_page_private(page, (unsigned long) mapping);
+ set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
@@ -2081,6 +2159,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
struct resv_map *reservations = vma_resv_map(vma);
+ struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve;
unsigned long start;
unsigned long end;
@@ -2096,7 +2175,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
if (reserve) {
hugetlb_acct_memory(h, -reserve);
- hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+ hugepage_subpool_put_pages(spool, reserve);
}
}
}
@@ -2326,7 +2405,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
address = address & huge_page_mask(h);
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
+ (vma->vm_pgoff >> PAGE_SHIFT);
- mapping = (struct address_space *)page_private(page);
+ mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
@@ -2865,11 +2944,12 @@ int hugetlb_reserve_pages(struct inode *inode,
{
long ret, chg;
struct hstate *h = hstate_inode(inode);
+ struct hugepage_subpool *spool = subpool_inode(inode);
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
- * and filesystem quota without using reserves
+ * without using reserves
*/
if (vm_flags & VM_NORESERVE)
return 0;
@@ -2898,19 +2978,19 @@ int hugetlb_reserve_pages(struct inode *inode,
goto out_err;
}
- /* There must be enough filesystem quota for the mapping */
- if (hugetlb_get_quota(inode->i_mapping, chg)) {
+ /* There must be enough pages in the subpool for the mapping */
+ if (hugepage_subpool_get_pages(spool, chg)) {
ret = -ENOSPC;
goto out_err;
}
/*
* Check enough hugepages are available for the reservation.
- * Hand back the quota if there are not
+ * Hand the pages back to the subpool if there are not
*/
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
- hugetlb_put_quota(inode->i_mapping, chg);
+ hugepage_subpool_put_pages(spool, chg);
goto out_err;
}
@@ -2938,12 +3018,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
struct hstate *h = hstate_inode(inode);
long chg = region_truncate(&inode->i_mapping->private_list, offset);
+ struct hugepage_subpool *spool = subpool_inode(inode);
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugetlb_put_quota(inode->i_mapping, (chg - freed));
+ hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}
--
1.7.7.6
@@ -0,0 +1,69 @@
From e45792228b6a4487d859334c757322554c960397 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 7 Feb 2012 14:59:05 -0500
Subject: [PATCH 014/109] NFSv4: Reduce the footprint of the idmapper
commit d073e9b541e1ac3f52d72c3a153855d9a9ee3278 upstream.
Instead of pre-allocating the storage for all the strings, we can
significantly reduce the size of that table by doing the allocation
when we do the downcall.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
[bwh: Backported to 3.2: adjust context in nfs_idmap_delete()]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/nfs/idmap.c | 16 +++++++++++++---
1 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 47d1c6f..b8c41c3 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -318,7 +318,7 @@ struct idmap_hashent {
unsigned long ih_expires;
__u32 ih_id;
size_t ih_namelen;
- char ih_name[IDMAP_NAMESZ];
+ const char *ih_name;
};
struct idmap_hashtable {
@@ -382,11 +382,16 @@ void
nfs_idmap_delete(struct nfs_client *clp)
{
struct idmap *idmap = clp->cl_idmap;
+ int i;
if (!idmap)
return;
rpc_unlink(idmap->idmap_dentry);
clp->cl_idmap = NULL;
+ for (i = 0; i < ARRAY_SIZE(idmap->idmap_user_hash.h_entries); i++)
+ kfree(idmap->idmap_user_hash.h_entries[i].ih_name);
+ for (i = 0; i < ARRAY_SIZE(idmap->idmap_group_hash.h_entries); i++)
+ kfree(idmap->idmap_group_hash.h_entries[i].ih_name);
kfree(idmap);
}
@@ -449,9 +454,14 @@ static void
idmap_update_entry(struct idmap_hashent *he, const char *name,
size_t namelen, __u32 id)
{
+ char *str = kmalloc(namelen + 1, GFP_KERNEL);
+ if (str == NULL)
+ return;
+ kfree(he->ih_name);
he->ih_id = id;
- memcpy(he->ih_name, name, namelen);
- he->ih_name[namelen] = '\0';
+ memcpy(str, name, namelen);
+ str[namelen] = '\0';
+ he->ih_name = str;
he->ih_namelen = namelen;
he->ih_expires = jiffies + nfs_idmap_cache_timeout;
}
--
1.7.7.6
@@ -0,0 +1,131 @@
From c297b1ec9340ec265bceeb8c1b8198ee476f0573 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 8 Feb 2012 13:39:15 -0500
Subject: [PATCH 015/109] NFSv4: Further reduce the footprint of the idmapper
commit 685f50f9188ac1e8244d0340a9d6ea36b6136cec upstream.
Don't allocate the legacy idmapper tables until we actually need
them.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
[bwh: Backported to 3.2: adjust context in nfs_idmap_delete()]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/nfs/idmap.c | 42 ++++++++++++++++++++++++++++++++++++------
1 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index b8c41c3..b122af8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -323,7 +323,7 @@ struct idmap_hashent {
struct idmap_hashtable {
__u8 h_type;
- struct idmap_hashent h_entries[IDMAP_HASH_SZ];
+ struct idmap_hashent *h_entries;
};
struct idmap {
@@ -378,20 +378,39 @@ nfs_idmap_new(struct nfs_client *clp)
return 0;
}
+static void
+idmap_alloc_hashtable(struct idmap_hashtable *h)
+{
+ if (h->h_entries != NULL)
+ return;
+ h->h_entries = kcalloc(IDMAP_HASH_SZ,
+ sizeof(*h->h_entries),
+ GFP_KERNEL);
+}
+
+static void
+idmap_free_hashtable(struct idmap_hashtable *h)
+{
+ int i;
+
+ if (h->h_entries == NULL)
+ return;
+ for (i = 0; i < IDMAP_HASH_SZ; i++)
+ kfree(h->h_entries[i].ih_name);
+ kfree(h->h_entries);
+}
+
void
nfs_idmap_delete(struct nfs_client *clp)
{
struct idmap *idmap = clp->cl_idmap;
- int i;
if (!idmap)
return;
rpc_unlink(idmap->idmap_dentry);
clp->cl_idmap = NULL;
- for (i = 0; i < ARRAY_SIZE(idmap->idmap_user_hash.h_entries); i++)
- kfree(idmap->idmap_user_hash.h_entries[i].ih_name);
- for (i = 0; i < ARRAY_SIZE(idmap->idmap_group_hash.h_entries); i++)
- kfree(idmap->idmap_group_hash.h_entries[i].ih_name);
+ idmap_free_hashtable(&idmap->idmap_user_hash);
+ idmap_free_hashtable(&idmap->idmap_group_hash);
kfree(idmap);
}
@@ -401,6 +420,8 @@ nfs_idmap_delete(struct nfs_client *clp)
static inline struct idmap_hashent *
idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len)
{
+ if (h->h_entries == NULL)
+ return NULL;
return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ];
}
@@ -409,6 +430,8 @@ idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len)
{
struct idmap_hashent *he = idmap_name_hash(h, name, len);
+ if (he == NULL)
+ return NULL;
if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0)
return NULL;
if (time_after(jiffies, he->ih_expires))
@@ -419,6 +442,8 @@ idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len)
static inline struct idmap_hashent *
idmap_id_hash(struct idmap_hashtable* h, __u32 id)
{
+ if (h->h_entries == NULL)
+ return NULL;
return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ];
}
@@ -426,6 +451,9 @@ static struct idmap_hashent *
idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
{
struct idmap_hashent *he = idmap_id_hash(h, id);
+
+ if (he == NULL)
+ return NULL;
if (he->ih_id != id || he->ih_namelen == 0)
return NULL;
if (time_after(jiffies, he->ih_expires))
@@ -441,12 +469,14 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
static inline struct idmap_hashent *
idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
{
+ idmap_alloc_hashtable(h);
return idmap_name_hash(h, name, len);
}
static inline struct idmap_hashent *
idmap_alloc_id(struct idmap_hashtable *h, __u32 id)
{
+ idmap_alloc_hashtable(h);
return idmap_id_hash(h, id);
}
--
1.7.7.6
@@ -0,0 +1,67 @@
From 6c4e2ff1b19d58c2a2f016d25b96eee0f733d7aa Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 2 May 2012 11:41:30 +0800
Subject: [PATCH 016/109] macvtap: zerocopy: fix offset calculation when
building skb
commit 3afc9621f15701c557e60f61eba9242bac2771dd upstream.
This patch fixes the offset calculation when building skb:
- offset1 were used as skb data offset not vector offset
- reset offset to zero only when we advance to next vector
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/macvtap.c | 13 +++++++------
1 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 1b7082d..4505008 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -504,10 +504,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
if (copy > size) {
++from;
--count;
- }
+ offset = 0;
+ } else
+ offset += size;
copy -= size;
offset1 += size;
- offset = 0;
}
if (len == offset1)
@@ -518,13 +519,13 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
int num_pages;
unsigned long base;
- len = from->iov_len - offset1;
+ len = from->iov_len - offset;
if (!len) {
- offset1 = 0;
+ offset = 0;
++from;
continue;
}
- base = (unsigned long)from->iov_base + offset1;
+ base = (unsigned long)from->iov_base + offset;
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
num_pages = get_user_pages_fast(base, size, 0, &page[i]);
if ((num_pages != size) ||
@@ -545,7 +546,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
len -= size;
i++;
}
- offset1 = 0;
+ offset = 0;
++from;
}
return 0;
--
1.7.7.6
@@ -0,0 +1,46 @@
From e2261c8945dd5af5a0627ac72f7a39f676f3b657 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 2 May 2012 11:41:44 +0800
Subject: [PATCH 017/109] macvtap: zerocopy: fix truesize underestimation
commit 4ef67ebedffa44ed9939b34708ac2fee06d2f65f upstream.
As the skb fragment were pinned/built from user pages, we should
account the page instead of length for truesize.
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/macvtap.c | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 4505008..c7a84eb 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -518,6 +518,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
struct page *page[MAX_SKB_FRAGS];
int num_pages;
unsigned long base;
+ unsigned long truesize;
len = from->iov_len - offset;
if (!len) {
@@ -532,10 +533,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
(num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
/* put_page is in skb free */
return -EFAULT;
+ truesize = size * PAGE_SIZE;
skb->data_len += len;
skb->len += len;
- skb->truesize += len;
- atomic_add(len, &skb->sk->sk_wmem_alloc);
+ skb->truesize += truesize;
+ atomic_add(truesize, &skb->sk->sk_wmem_alloc);
while (len) {
int off = base & ~PAGE_MASK;
int size = min_t(int, len, PAGE_SIZE - off);
--
1.7.7.6
@@ -0,0 +1,40 @@
From 13d71d7ee644607d525480330c8b6a4268c18c0e Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 2 May 2012 11:41:58 +0800
Subject: [PATCH 018/109] macvtap: zerocopy: put page when fail to get all
requested user pages
commit 02ce04bb3d28c3333231f43bca677228dbc686fe upstream.
When get_user_pages_fast() fails to get all requested pages, we could not use
kfree_skb() to free it as it has not been put in the skb fragments. So we need
to call put_page() instead.
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/macvtap.c | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index c7a84eb..f217247 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -530,9 +530,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
num_pages = get_user_pages_fast(base, size, 0, &page[i]);
if ((num_pages != size) ||
- (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
- /* put_page is in skb free */
+ (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) {
+ for (i = 0; i < num_pages; i++)
+ put_page(page[i]);
return -EFAULT;
+ }
truesize = size * PAGE_SIZE;
skb->data_len += len;
skb->len += len;
--
1.7.7.6
@@ -0,0 +1,53 @@
From c57df8c118c4c11ef6023034aa454636dd8780db Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 2 May 2012 11:42:06 +0800
Subject: [PATCH 019/109] macvtap: zerocopy: set SKBTX_DEV_ZEROCOPY only when
skb is built successfully
commit 01d6657b388438def19c8baaea28e742b6ed32ec upstream.
Current the SKBTX_DEV_ZEROCOPY is set unconditionally after
zerocopy_sg_from_iovec(), this would lead NULL pointer when macvtap
fails to build zerocopy skb because destructor_arg was not
initialized. Solve this by set this flag after the skb were built
successfully.
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/macvtap.c | 9 +++++----
1 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index f217247..7fecd66 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -698,10 +698,9 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
if (!skb)
goto err;
- if (zerocopy) {
+ if (zerocopy)
err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
- skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
- } else
+ else
err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
len);
if (err)
@@ -720,8 +719,10 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
rcu_read_lock_bh();
vlan = rcu_dereference_bh(q->vlan);
/* copy skb_ubuf_info for callback when skb has no error */
- if (zerocopy)
+ if (zerocopy) {
skb_shinfo(skb)->destructor_arg = m->msg_control;
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ }
if (vlan)
macvlan_start_xmit(skb, vlan->dev);
else
--
1.7.7.6
@@ -0,0 +1,84 @@
From c93ad33631e3efbb6f02f24c6b6817227b2c9252 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 2 May 2012 11:42:15 +0800
Subject: [PATCH 020/109] macvtap: zerocopy: validate vectors before building
skb
commit b92946e2919134ebe2a4083e4302236295ea2a73 upstream.
There're several reasons that the vectors need to be validated:
- Return error when caller provides vectors whose num is greater than UIO_MAXIOV.
- Linearize part of skb when userspace provides vectors grater than MAX_SKB_FRAGS.
- Return error when userspace provides vectors whose total length may exceed
- MAX_SKB_FRAGS * PAGE_SIZE.
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/macvtap.c | 25 +++++++++++++++++++++----
1 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 7fecd66..26106c0 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -528,9 +528,10 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
}
base = (unsigned long)from->iov_base + offset;
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (i + size > MAX_SKB_FRAGS)
+ return -EMSGSIZE;
num_pages = get_user_pages_fast(base, size, 0, &page[i]);
- if ((num_pages != size) ||
- (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) {
+ if (num_pages != size) {
for (i = 0; i < num_pages; i++)
put_page(page[i]);
return -EFAULT;
@@ -650,7 +651,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
int err;
struct virtio_net_hdr vnet_hdr = { 0 };
int vnet_hdr_len = 0;
- int copylen;
+ int copylen = 0;
bool zerocopy = false;
if (q->flags & IFF_VNET_HDR) {
@@ -679,15 +680,31 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
if (unlikely(len < ETH_HLEN))
goto err;
+ err = -EMSGSIZE;
+ if (unlikely(count > UIO_MAXIOV))
+ goto err;
+
if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
zerocopy = true;
if (zerocopy) {
+ /* Userspace may produce vectors with count greater than
+ * MAX_SKB_FRAGS, so we need to linearize parts of the skb
+ * to let the rest of data to be fit in the frags.
+ */
+ if (count > MAX_SKB_FRAGS) {
+ copylen = iov_length(iv, count - MAX_SKB_FRAGS);
+ if (copylen < vnet_hdr_len)
+ copylen = 0;
+ else
+ copylen -= vnet_hdr_len;
+ }
/* There are 256 bytes to be copied in skb, so there is enough
* room for skb expand head in case it is used.
* The rest buffer is mapped from userspace.
*/
- copylen = vnet_hdr.hdr_len;
+ if (copylen < vnet_hdr.hdr_len)
+ copylen = vnet_hdr.hdr_len;
if (!copylen)
copylen = GOODCOPY_LEN;
} else
--
1.7.7.6
@@ -0,0 +1,35 @@
From 274c1b4b54a12df73eb5fc2763a294ff2a04669c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 22 Apr 2012 17:02:11 +0300
Subject: [PATCH 021/109] KVM: Fix buffer overflow in kvm_set_irq()
commit f2ebd422f71cda9c791f76f85d2ca102ae34a1ed upstream.
kvm_set_irq() has an internal buffer of three irq routing entries, allowing
connecting a GSI to three IRQ chips or on MSI. However setup_routing_entry()
does not properly enforce this, allowing three irqchip routes followed by
an MSI route to overflow the buffer.
Fix by ensuring that an MSI entry is added to an empty list.
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
virt/kvm/irq_comm.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 9f614b4..272407c 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -318,6 +318,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
*/
hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
if (ei->type == KVM_IRQ_ROUTING_MSI ||
+ ue->type == KVM_IRQ_ROUTING_MSI ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return r;
--
1.7.7.6
@@ -0,0 +1,54 @@
From d0ef0e99137f622218e0395ccc12210e4804b5ed Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 15 Jun 2012 12:52:46 +0200
Subject: [PATCH 022/109] scsi: Silence unnecessary warnings about ioctl to
partition
commit 6d9359280753d2955f86d6411047516a9431eb51 upstream.
Sometimes, warnings about ioctls to partition happen often enough that they
form majority of the warnings in the kernel log and users complain. In some
cases warnings are about ioctls such as SG_IO so it's not good to get rid of
the warnings completely as they can ease debugging of userspace problems
when ioctl is refused.
Since I have seen warnings from lots of commands, including some proprietary
userspace applications, I don't think disallowing the ioctls for processes
with CAP_SYS_RAWIO will happen in the near future if ever. So lets just
stop warning for processes with CAP_SYS_RAWIO for which ioctl is allowed.
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: James Bottomley <JBottomley@parallels.com>
CC: linux-scsi@vger.kernel.org
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
[bwh: Backported to 3.2: use ENOTTY, not ENOIOCTLCMD]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
block/scsi_ioctl.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 688be8a..9e76a32 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -721,11 +721,14 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
break;
}
+ if (capable(CAP_SYS_RAWIO))
+ return 0;
+
/* In particular, rule out all resets and host-specific ioctls. */
printk_ratelimited(KERN_WARNING
"%s: sending ioctl %x to a partition!\n", current->comm, cmd);
- return capable(CAP_SYS_RAWIO) ? 0 : -ENOTTY;
+ return -ENOTTY;
}
EXPORT_SYMBOL(scsi_verify_blk_ioctl);
--
1.7.7.6
@@ -0,0 +1,43 @@
From 4255dce0ae728fe63f19ded56b5dc2c324d6f18d Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Wed, 25 Apr 2012 16:01:47 -0700
Subject: [PATCH 023/109] epoll: clear the tfile_check_list on -ELOOP
commit 13d518074a952d33d47c428419693f63389547e9 upstream.
An epoll_ctl(,EPOLL_CTL_ADD,,) operation can return '-ELOOP' to prevent
circular epoll dependencies from being created. However, in that case we
do not properly clear the 'tfile_check_list'. Thus, add a call to
clear_tfile_check_list() for the -ELOOP case.
Signed-off-by: Jason Baron <jbaron@redhat.com>
Reported-by: Yurij M. Plotnikov <Yurij.Plotnikov@oktetlabs.ru>
Cc: Nelson Elhage <nelhage@nelhage.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Tested-by: Alexandra N. Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/eventpoll.c | 4 +++-
1 files changed, 3 insertions(+), 1 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4d9d3a4..a6f3763 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1629,8 +1629,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
if (op == EPOLL_CTL_ADD) {
if (is_file_epoll(tfile)) {
error = -ELOOP;
- if (ep_loop_check(ep, tfile) != 0)
+ if (ep_loop_check(ep, tfile) != 0) {
+ clear_tfile_check_list();
goto error_tgt_fput;
+ }
} else
list_add(&tfile->f_tfile_llink, &tfile_check_list);
}
--
1.7.7.6
@@ -0,0 +1,45 @@
From be1ab01a00ec19b273050ad8f3fbb9472238b026 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuah.khan@hp.com>
Date: Wed, 6 Jun 2012 10:50:06 -0600
Subject: [PATCH 024/109] iommu/amd: Fix missing iommu_shutdown initialization
in passthrough mode
commit f2f12b6fc032c7b1419fd6db84e2868b5f05a878 upstream.
The iommu_shutdown callback is not initialized when the AMD
IOMMU driver runs in passthrough mode. Fix that by moving
the callback initialization before the check for
passthrough mode.
Signed-off-by: Shuah Khan <shuah.khan@hp.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
[bwh: Backported to 3.2: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/iommu/amd_iommu_init.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 6269eb0..ef2d493 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -1468,6 +1468,8 @@ static int __init amd_iommu_init(void)
register_syscore_ops(&amd_iommu_syscore_ops);
+ x86_platform.iommu_shutdown = disable_iommus;
+
if (iommu_pass_through)
goto out;
@@ -1476,7 +1478,6 @@ static int __init amd_iommu_init(void)
else
printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
- x86_platform.iommu_shutdown = disable_iommus;
out:
return ret;
--
1.7.7.6
@@ -0,0 +1,53 @@
From 60eec119060fbd35f569fd77fd448dbcd8d8f011 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jun 2012 14:52:40 +0200
Subject: [PATCH 025/109] iommu/amd: Initialize dma_ops for hotplug and sriov
devices
commit ac1534a55d1e87d59a21c09c570605933b551480 upstream.
When a device is added to the system at runtime the AMD
IOMMU driver initializes the necessary data structures to
handle translation for it. But it forgets to change the
per-device dma_ops to point to the AMD IOMMU driver. So
mapping actually never happens and all DMA accesses end in
an IO_PAGE_FAULT. Fix this.
Reported-by: Stefan Assmann <sassmann@redhat.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
[bwh: Backported to 3.2:
- Adjust context
- Use global iommu_pass_through; there is no per-device pass_through]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/iommu/amd_iommu.c | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index f1d5408..a1b8caa 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -59,6 +59,8 @@ static struct protection_domain *pt_domain;
static struct iommu_ops amd_iommu_ops;
+static struct dma_map_ops amd_iommu_dma_ops;
+
/*
* general struct to manage commands send to an IOMMU
*/
@@ -1878,6 +1880,11 @@ static int device_change_notifier(struct notifier_block *nb,
list_add_tail(&dma_domain->list, &iommu_pd_list);
spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+ if (!iommu_pass_through)
+ dev->archdata.dma_ops = &amd_iommu_dma_ops;
+ else
+ dev->archdata.dma_ops = &nommu_dma_ops;
+
break;
case BUS_NOTIFY_DEL_DEVICE:
--
1.7.7.6
@@ -0,0 +1,170 @@
From 6455e1a5be4eb27a1587322e0ed3a7ff6c3d8e4f Mon Sep 17 00:00:00 2001
From: Stanislaw Ledwon <staszek.ledwon@linux.jf.intel.com>
Date: Mon, 18 Jun 2012 15:20:00 +0200
Subject: [PATCH 026/109] usb: Add support for root hub port status CAS
commit 8bea2bd37df08aaa599aa361a9f8b836ba98e554 upstream.
The host controller port status register supports CAS (Cold Attach
Status) bit. This bit could be set when USB3.0 device is connected
when system is in Sx state. When the system wakes to S0 this port
status with CAS bit is reported and this port can't be used by any
device.
When CAS bit is set the port should be reset by warm reset. This
was not supported by xhci driver.
The issue was found when pendrive was connected to suspended
platform. The link state of "Compliance Mode" was reported together
with CAS bit. This link state was also not supported by xhci and
core/hub.c.
The CAS bit is defined only for xhci root hub port and it is
not supported on regular hubs. The link status is used to force
warm reset on port. Make the USB core issue a warm reset when port
is in ether the 'inactive' or 'compliance mode'. Change the xHCI driver
to report 'compliance mode' when the CAS is set. This force warm reset
on the root hub port.
This patch should be backported to stable kernels as old as 3.2, that
contain the commit 10d674a82e553cb8a1f41027bb3c3e309b3f6804 "USB: When
hot reset for USB3 fails, try warm reset."
Signed-off-by: Stanislaw Ledwon <staszek.ledwon@linux.intel.com>
Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Acked-by: Andiry Xu <andiry.xu@amd.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/usb/core/hub.c | 18 +++++++++-------
drivers/usb/host/xhci-hub.c | 44 +++++++++++++++++++++++++++++++++++++-----
drivers/usb/host/xhci.h | 6 ++++-
3 files changed, 53 insertions(+), 15 deletions(-)
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 50cf41a..2bc736f 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2039,12 +2039,16 @@ static unsigned hub_is_wusb(struct usb_hub *hub)
static int hub_port_reset(struct usb_hub *hub, int port1,
struct usb_device *udev, unsigned int delay, bool warm);
-/* Is a USB 3.0 port in the Inactive state? */
-static bool hub_port_inactive(struct usb_hub *hub, u16 portstatus)
+/* Is a USB 3.0 port in the Inactive or Complinance Mode state?
+ * Port worm reset is required to recover
+ */
+static bool hub_port_warm_reset_required(struct usb_hub *hub, u16 portstatus)
{
return hub_is_superspeed(hub->hdev) &&
- (portstatus & USB_PORT_STAT_LINK_STATE) ==
- USB_SS_PORT_LS_SS_INACTIVE;
+ (((portstatus & USB_PORT_STAT_LINK_STATE) ==
+ USB_SS_PORT_LS_SS_INACTIVE) ||
+ ((portstatus & USB_PORT_STAT_LINK_STATE) ==
+ USB_SS_PORT_LS_COMP_MOD)) ;
}
static int hub_port_wait_reset(struct usb_hub *hub, int port1,
@@ -2080,7 +2084,7 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1,
*
* See https://bugzilla.kernel.org/show_bug.cgi?id=41752
*/
- if (hub_port_inactive(hub, portstatus)) {
+ if (hub_port_warm_reset_required(hub, portstatus)) {
int ret;
if ((portchange & USB_PORT_STAT_C_CONNECTION))
@@ -3646,9 +3650,7 @@ static void hub_events(void)
/* Warm reset a USB3 protocol port if it's in
* SS.Inactive state.
*/
- if (hub_is_superspeed(hub->hdev) &&
- (portstatus & USB_PORT_STAT_LINK_STATE)
- == USB_SS_PORT_LS_SS_INACTIVE) {
+ if (hub_port_warm_reset_required(hub, portstatus)) {
dev_dbg(hub_dev, "warm reset port %d\n", i);
hub_port_reset(hub, i, NULL,
HUB_BH_RESET_TIME, true);
diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index a8b2980..fd8a2c2 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -438,6 +438,42 @@ void xhci_test_and_clear_bit(struct xhci_hcd *xhci, __le32 __iomem **port_array,
}
}
+/* Updates Link Status for super Speed port */
+static void xhci_hub_report_link_state(u32 *status, u32 status_reg)
+{
+ u32 pls = status_reg & PORT_PLS_MASK;
+
+ /* resume state is a xHCI internal state.
+ * Do not report it to usb core.
+ */
+ if (pls == XDEV_RESUME)
+ return;
+
+ /* When the CAS bit is set then warm reset
+ * should be performed on port
+ */
+ if (status_reg & PORT_CAS) {
+ /* The CAS bit can be set while the port is
+ * in any link state.
+ * Only roothubs have CAS bit, so we
+ * pretend to be in compliance mode
+ * unless we're already in compliance
+ * or the inactive state.
+ */
+ if (pls != USB_SS_PORT_LS_COMP_MOD &&
+ pls != USB_SS_PORT_LS_SS_INACTIVE) {
+ pls = USB_SS_PORT_LS_COMP_MOD;
+ }
+ /* Return also connection bit -
+ * hub state machine resets port
+ * when this bit is set.
+ */
+ pls |= USB_PORT_STAT_CONNECTION;
+ }
+ /* update status field */
+ *status |= pls;
+}
+
int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
u16 wIndex, char *buf, u16 wLength)
{
@@ -579,13 +615,9 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
else
status |= USB_PORT_STAT_POWER;
}
- /* Port Link State */
+ /* Update Port Link State for super speed ports*/
if (hcd->speed == HCD_USB3) {
- /* resume state is a xHCI internal state.
- * Do not report it to usb core.
- */
- if ((temp & PORT_PLS_MASK) != XDEV_RESUME)
- status |= (temp & PORT_PLS_MASK);
+ xhci_hub_report_link_state(&status, temp);
}
if (bus_state->port_c_suspend & (1 << wIndex))
status |= 1 << USB_PORT_FEAT_C_SUSPEND;
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 363b141..7a56805 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -341,7 +341,11 @@ struct xhci_op_regs {
#define PORT_PLC (1 << 22)
/* port configure error change - port failed to configure its link partner */
#define PORT_CEC (1 << 23)
-/* bit 24 reserved */
+/* Cold Attach Status - xHC can set this bit to report device attached during
+ * Sx state. Warm port reset should be perfomed to clear this bit and move port
+ * to connected state.
+ */
+#define PORT_CAS (1 << 24)
/* wake on connect (enable) */
#define PORT_WKCONN_E (1 << 25)
/* wake on disconnect (enable) */
--
1.7.7.6
@@ -0,0 +1,40 @@
From 5cdc2897a0bb7b11585d5b14eb3f2faa1505348c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Sat, 9 Jun 2012 11:07:56 +0800
Subject: [PATCH 027/109] gpiolib: wm8994: Pay attention to the value set when
enabling as output
commit 8cd578b6e28693f357867a77598a88ef3deb6b39 upstream.
Not paying attention to the value being set is a bad thing because it
means that we'll not set the hardware up to reflect what was requested.
Not setting the hardware up to reflect what was requested means that the
caller won't get the results they wanted.
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/gpio/gpio-wm8994.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/drivers/gpio/gpio-wm8994.c b/drivers/gpio/gpio-wm8994.c
index 96198f3..a2da8f2 100644
--- a/drivers/gpio/gpio-wm8994.c
+++ b/drivers/gpio/gpio-wm8994.c
@@ -89,8 +89,11 @@ static int wm8994_gpio_direction_out(struct gpio_chip *chip,
struct wm8994_gpio *wm8994_gpio = to_wm8994_gpio(chip);
struct wm8994 *wm8994 = wm8994_gpio->wm8994;
+ if (value)
+ value = WM8994_GPN_LVL;
+
return wm8994_set_bits(wm8994, WM8994_GPIO_1 + offset,
- WM8994_GPN_DIR, 0);
+ WM8994_GPN_DIR | WM8994_GPN_LVL, value);
}
static void wm8994_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
--
1.7.7.6
@@ -0,0 +1,462 @@
From a7d3f237430003ca8d32d1703770f04d32a02b27 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 22 Jun 2012 15:52:09 +0200
Subject: [PATCH 028/109] sched/nohz: Rewrite and fix load-avg computation --
again
commit 5167e8d5417bf5c322a703d2927daec727ea40dd upstream.
Thanks to Charles Wang for spotting the defects in the current code:
- If we go idle during the sample window -- after sampling, we get a
negative bias because we can negate our own sample.
- If we wake up during the sample window we get a positive bias
because we push the sample to a known active period.
So rewrite the entire nohz load-avg muck once again, now adding
copious documentation to the code.
Reported-and-tested-by: Doug Smythies <dsmythies@telus.net>
Reported-and-tested-by: Charles Wang <muming.wq@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins
[ minor edits ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 3.2: adjust filenames, context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
include/linux/sched.h | 8 ++
kernel/sched.c | 276 ++++++++++++++++++++++++++++++++++------------
kernel/sched_idletask.c | 1 -
kernel/time/tick-sched.c | 2 +
4 files changed, 213 insertions(+), 74 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c4f3e9..5afa2a3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1892,6 +1892,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
}
#endif
+#ifdef CONFIG_NO_HZ
+void calc_load_enter_idle(void);
+void calc_load_exit_idle(void);
+#else
+static inline void calc_load_enter_idle(void) { }
+static inline void calc_load_exit_idle(void) { }
+#endif /* CONFIG_NO_HZ */
+
#ifndef CONFIG_CPUMASK_OFFSTACK
static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
diff --git a/kernel/sched.c b/kernel/sched.c
index 576a27f..52ac69b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1885,7 +1885,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
#endif
-static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
static void update_cpu_load(struct rq *this_rq);
@@ -3401,11 +3400,73 @@ unsigned long this_cpu_load(void)
}
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
static long calc_load_fold_active(struct rq *this_rq)
{
@@ -3422,6 +3483,9 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -3433,30 +3497,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
#ifdef CONFIG_NO_HZ
/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
-static void calc_load_account_idle(struct rq *this_rq)
+static inline int calc_load_write_idx(void)
{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
long delta;
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
}
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
{
- long delta = 0;
+ struct rq *this_rq = this_rq();
+
+ /*
+ * If we're still before the sample window, we're done.
+ */
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
/*
- * Its got a race, we don't care...
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
*/
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
return delta;
}
@@ -3542,66 +3694,39 @@ static void calc_global_nohz(void)
{
long delta, active, n;
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * It could be the one fold was all it took, we done!
- */
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
-}
-#else
-static void calc_load_account_idle(struct rq *this_rq)
-{
-}
+ calc_load_update += n * LOAD_FREQ;
+ }
-static inline long calc_load_fold_idle(void)
-{
- return 0;
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
}
+#else /* !CONFIG_NO_HZ */
-static void calc_global_nohz(void)
-{
-}
-#endif
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -3609,11 +3734,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
*/
void calc_global_load(unsigned long ticks)
{
- long active;
+ long active, delta;
if (time_before(jiffies, calc_load_update + 10))
return;
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
@@ -3624,12 +3756,7 @@ void calc_global_load(unsigned long ticks)
calc_load_update += LOAD_FREQ;
/*
- * Account one period with whatever state we found before
- * folding in the nohz state and ageing the entire idle period.
- *
- * This avoids loosing a sample when we go idle between
- * calc_load_account_active() (10 ticks ago) and now and thus
- * under-accounting.
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
*/
calc_global_nohz();
}
@@ -3646,7 +3773,6 @@ static void calc_load_account_active(struct rq *this_rq)
return;
delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
@@ -3654,6 +3780,10 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * End of global load-average stuff
+ */
+
+/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
*
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 0a51882..be92bfe 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -23,7 +23,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
- calc_load_account_idle(rq);
return rq->idle;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c923640..9955ebd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -430,6 +430,7 @@ void tick_nohz_stop_sched_tick(int inidle)
*/
if (!ts->tick_stopped) {
select_nohz_load_balancer(1);
+ calc_load_enter_idle();
ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -563,6 +564,7 @@ void tick_nohz_restart_sched_tick(void)
account_idle_ticks(ticks);
#endif
+ calc_load_exit_idle();
touch_softlockup_watchdog();
/*
* Cancel the scheduled timer and restore the tick
--
1.7.7.6
@@ -0,0 +1,70 @@
From 4090ab847de2c528ae152e864a7ce604ef300837 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= <bjorn@mork.no>
Date: Mon, 2 Jul 2012 19:53:55 +0200
Subject: [PATCH 029/109] USB: option: add ZTE MF60
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 8e16e33c168a6efd0c9f7fa9dd4c1e1db9a74553 upstream.
Switches into a composite device by ejecting the initial
driver CD. The four interfaces are: QCDM, AT, QMI/wwan
and mass storage. Let this driver manage the two serial
interfaces:
T: Bus=02 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 28 Spd=480 MxCh= 0
D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1
P: Vendor=19d2 ProdID=1402 Rev= 0.00
S: Manufacturer=ZTE,Incorporated
S: Product=ZTE WCDMA Technologies MSM
S: SerialNumber=xxxxx
C:* #Ifs= 4 Cfg#= 1 Atr=c0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option
E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option
E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms
I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan
E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms
E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms
I:* If#= 3 Alt= 0 #EPs= 2 Cls=08(stor.) Sub=06 Prot=50 Driver=usb-storage
E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/usb/serial/option.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 21a4734..5960c7b 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -553,6 +553,10 @@ static const struct option_blacklist_info net_intf1_blacklist = {
.reserved = BIT(1),
};
+static const struct option_blacklist_info net_intf2_blacklist = {
+ .reserved = BIT(2),
+};
+
static const struct option_blacklist_info net_intf3_blacklist = {
.reserved = BIT(3),
};
@@ -1093,6 +1097,8 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1298, 0xff, 0xff, 0xff) },
{ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1299, 0xff, 0xff, 0xff) },
{ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1300, 0xff, 0xff, 0xff) },
+ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1402, 0xff, 0xff, 0xff),
+ .driver_info = (kernel_ulong_t)&net_intf2_blacklist },
{ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2002, 0xff,
0xff, 0xff), .driver_info = (kernel_ulong_t)&zte_k3765_z_blacklist },
{ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2003, 0xff, 0xff, 0xff) },
--
1.7.7.6
@@ -0,0 +1,55 @@
From 26b05210d9f77b9b92fb12a73da5b9f6cb1b3f07 Mon Sep 17 00:00:00 2001
From: Gaosen Zhang <gaosen.zhang@mediatek.com>
Date: Thu, 5 Jul 2012 21:49:00 +0800
Subject: [PATCH 030/109] USB: option: Add MEDIATEK product ids
commit aacef9c561a693341566a6850c451ce3df68cb9a upstream.
Signed-off-by: Gaosen Zhang <gaosen.zhang@mediatek.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/usb/serial/option.c | 20 ++++++++++++++++++++
1 files changed, 20 insertions(+), 0 deletions(-)
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 5960c7b..5971c95 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -496,6 +496,15 @@ static void option_instat_callback(struct urb *urb);
/* MediaTek products */
#define MEDIATEK_VENDOR_ID 0x0e8d
+#define MEDIATEK_PRODUCT_DC_1COM 0x00a0
+#define MEDIATEK_PRODUCT_DC_4COM 0x00a5
+#define MEDIATEK_PRODUCT_DC_5COM 0x00a4
+#define MEDIATEK_PRODUCT_7208_1COM 0x7101
+#define MEDIATEK_PRODUCT_7208_2COM 0x7102
+#define MEDIATEK_PRODUCT_FP_1COM 0x0003
+#define MEDIATEK_PRODUCT_FP_2COM 0x0023
+#define MEDIATEK_PRODUCT_FPDC_1COM 0x0043
+#define MEDIATEK_PRODUCT_FPDC_2COM 0x0033
/* Cellient products */
#define CELLIENT_VENDOR_ID 0x2692
@@ -1240,6 +1249,17 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a1, 0xff, 0x02, 0x01) },
{ USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x00, 0x00) },
{ USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x02, 0x01) }, /* MediaTek MT6276M modem & app port */
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_1COM, 0x0a, 0x00, 0x00) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_5COM, 0xff, 0x02, 0x01) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_5COM, 0xff, 0x00, 0x00) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM, 0xff, 0x02, 0x01) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM, 0xff, 0x00, 0x00) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7208_1COM, 0x02, 0x00, 0x00) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7208_2COM, 0x02, 0x02, 0x01) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FP_1COM, 0x0a, 0x00, 0x00) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FP_2COM, 0x0a, 0x00, 0x00) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FPDC_1COM, 0x0a, 0x00, 0x00) },
+ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FPDC_2COM, 0x0a, 0x00, 0x00) },
{ USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) },
{ } /* Terminating entry */
};
--
1.7.7.6
@@ -0,0 +1,78 @@
From 6fb488dec8482c866a2c7cd4d1da06b85b8b28c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= <bjorn@mork.no>
Date: Mon, 2 Jul 2012 10:33:14 +0200
Subject: [PATCH 031/109] USB: cdc-wdm: fix lockup on error in wdm_read
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit b086b6b10d9f182cd8d2f0dcfd7fd11edba93fc9 upstream.
Clear the WDM_READ flag on empty reads to avoid running
forever in an infinite tight loop, causing lockups:
Jul 1 21:58:11 nemi kernel: [ 3658.898647] qmi_wwan 2-1:1.2: Unexpected error -71
Jul 1 21:58:36 nemi kernel: [ 3684.072021] BUG: soft lockup - CPU#0 stuck for 23s! [qmi.pl:12235]
Jul 1 21:58:36 nemi kernel: [ 3684.072212] CPU 0
Jul 1 21:58:36 nemi kernel: [ 3684.072355]
Jul 1 21:58:36 nemi kernel: [ 3684.072367] Pid: 12235, comm: qmi.pl Tainted: P O 3.5.0-rc2+ #13 LENOVO 2776LEG/2776LEG
Jul 1 21:58:36 nemi kernel: [ 3684.072383] RIP: 0010:[<ffffffffa0635008>] [<ffffffffa0635008>] spin_unlock_irq+0x8/0xc [cdc_wdm]
Jul 1 21:58:36 nemi kernel: [ 3684.072388] RSP: 0018:ffff88022dca1e70 EFLAGS: 00000282
Jul 1 21:58:36 nemi kernel: [ 3684.072393] RAX: ffff88022fc3f650 RBX: ffffffff811c56f7 RCX: 00000001000ce8c1
Jul 1 21:58:36 nemi kernel: [ 3684.072398] RDX: 0000000000000010 RSI: 000000000267d810 RDI: ffff88022fc3f650
Jul 1 21:58:36 nemi kernel: [ 3684.072403] RBP: ffff88022dca1eb0 R08: ffffffffa063578e R09: 0000000000000000
Jul 1 21:58:36 nemi kernel: [ 3684.072407] R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000002
Jul 1 21:58:36 nemi kernel: [ 3684.072412] R13: 0000000000000246 R14: ffffffff00000002 R15: ffff8802281d8c88
Jul 1 21:58:36 nemi kernel: [ 3684.072418] FS: 00007f666a260700(0000) GS:ffff88023bc00000(0000) knlGS:0000000000000000
Jul 1 21:58:36 nemi kernel: [ 3684.072423] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jul 1 21:58:36 nemi kernel: [ 3684.072428] CR2: 000000000270d9d8 CR3: 000000022e865000 CR4: 00000000000007f0
Jul 1 21:58:36 nemi kernel: [ 3684.072433] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jul 1 21:58:36 nemi kernel: [ 3684.072438] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Jul 1 21:58:36 nemi kernel: [ 3684.072444] Process qmi.pl (pid: 12235, threadinfo ffff88022dca0000, task ffff88022ff76380)
Jul 1 21:58:36 nemi kernel: [ 3684.072448] Stack:
Jul 1 21:58:36 nemi kernel: [ 3684.072458] ffffffffa063592e 0000000100020000 ffff88022fc3f650 ffff88022fc3f6a8
Jul 1 21:58:36 nemi kernel: [ 3684.072466] 0000000000000200 0000000100000000 000000000267d810 0000000000000000
Jul 1 21:58:36 nemi kernel: [ 3684.072475] 0000000000000000 ffff880212cfb6d0 0000000000000200 ffff880212cfb6c0
Jul 1 21:58:36 nemi kernel: [ 3684.072479] Call Trace:
Jul 1 21:58:36 nemi kernel: [ 3684.072489] [<ffffffffa063592e>] ? wdm_read+0x1a0/0x263 [cdc_wdm]
Jul 1 21:58:36 nemi kernel: [ 3684.072500] [<ffffffff8110adb7>] ? vfs_read+0xa1/0xfb
Jul 1 21:58:36 nemi kernel: [ 3684.072509] [<ffffffff81040589>] ? alarm_setitimer+0x35/0x64
Jul 1 21:58:36 nemi kernel: [ 3684.072517] [<ffffffff8110aec7>] ? sys_read+0x45/0x6e
Jul 1 21:58:36 nemi kernel: [ 3684.072525] [<ffffffff813725f9>] ? system_call_fastpath+0x16/0x1b
Jul 1 21:58:36 nemi kernel: [ 3684.072557] Code: <66> 66 90 c3 83 ff ed 89 f8 74 16 7f 06 83 ff a1 75 0a c3 83 ff f4
The WDM_READ flag is normally cleared by wdm_int_callback
before resubmitting the read urb, and set by wdm_in_callback
when this urb returns with data or an error. But a crashing
device may cause both a read error and cancelling all urbs.
Make sure that the flag is cleared by wdm_read if the buffer
is empty.
We don't clear the flag on errors, as there may be pending
data in the buffer which should be processed. The flag will
instead be cleared on the next wdm_read call.
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Acked-by: Oliver Neukum <oneukum@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/usb/class/cdc-wdm.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 19fb5fa..9aaed0d 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -473,6 +473,8 @@ retry:
goto retry;
}
if (!desc->reslength) { /* zero length read */
+ dev_dbg(&desc->intf->dev, "%s: zero length - clearing WDM_READ\n", __func__);
+ clear_bit(WDM_READ, &desc->flags);
spin_unlock_irq(&desc->iuspin);
goto retry;
}
--
1.7.7.6
@@ -0,0 +1,94 @@
From bb5561cb838492a05e5dae25114d768828fe2dfe Mon Sep 17 00:00:00 2001
From: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com>
Date: Wed, 16 May 2012 16:21:52 -0300
Subject: [PATCH 032/109] mtd: nandsim: don't open code a do_div helper
commit 596fd46268634082314b3af1ded4612e1b7f3f03 upstream.
We don't need to open code the divide function, just use div_u64 that
already exists and do the same job. While this is a straightforward
clean up, there is more to that, the real motivation for this.
While building on a cross compiling environment in armel, using gcc
4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5), I was getting the following build
error:
ERROR: "__aeabi_uldivmod" [drivers/mtd/nand/nandsim.ko] undefined!
After investigating with objdump and hand built assembly version
generated with the compiler, I narrowed __aeabi_uldivmod as being
generated from the divide function. When nandsim.c is built with
-fno-inline-functions-called-once, that happens when
CONFIG_DEBUG_SECTION_MISMATCH is enabled, the do_div optimization in
arch/arm/include/asm/div64.h doesn't work as expected with the open
coded divide function: even if the do_div we are using doesn't have a
constant divisor, the compiler still includes the else parts of the
optimized do_div macro, and translates the divisions there to use
__aeabi_uldivmod, instead of only calling __do_div_asm -> __do_div64 and
optimizing/removing everything else out.
So to reproduce, gcc 4.6 plus CONFIG_DEBUG_SECTION_MISMATCH=y and
CONFIG_MTD_NAND_NANDSIM=m should do it, building on armel.
After this change, the compiler does the intended thing even with
-fno-inline-functions-called-once, and optimizes out as expected the
constant handling in the optimized do_div on arm. As this also avoids a
build issue, I'm marking for Stable, as I think is applicable for this
case.
Signed-off-by: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com>
Acked-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/mtd/nand/nandsim.c | 12 +++---------
1 files changed, 3 insertions(+), 9 deletions(-)
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index 34c03be..83e8e1b 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -28,7 +28,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/string.h>
@@ -547,12 +547,6 @@ static char *get_partition_name(int i)
return kstrdup(buf, GFP_KERNEL);
}
-static uint64_t divide(uint64_t n, uint32_t d)
-{
- do_div(n, d);
- return n;
-}
-
/*
* Initialize the nandsim structure.
*
@@ -581,7 +575,7 @@ static int init_nandsim(struct mtd_info *mtd)
ns->geom.oobsz = mtd->oobsize;
ns->geom.secsz = mtd->erasesize;
ns->geom.pgszoob = ns->geom.pgsz + ns->geom.oobsz;
- ns->geom.pgnum = divide(ns->geom.totsz, ns->geom.pgsz);
+ ns->geom.pgnum = div_u64(ns->geom.totsz, ns->geom.pgsz);
ns->geom.totszoob = ns->geom.totsz + (uint64_t)ns->geom.pgnum * ns->geom.oobsz;
ns->geom.secshift = ffs(ns->geom.secsz) - 1;
ns->geom.pgshift = chip->page_shift;
@@ -924,7 +918,7 @@ static int setup_wear_reporting(struct mtd_info *mtd)
if (!rptwear)
return 0;
- wear_eb_count = divide(mtd->size, mtd->erasesize);
+ wear_eb_count = div_u64(mtd->size, mtd->erasesize);
mem = wear_eb_count * sizeof(unsigned long);
if (mem / sizeof(unsigned long) != wear_eb_count) {
NS_ERR("Too many erase blocks for wear reporting\n");
--
1.7.7.6
@@ -0,0 +1,33 @@
From 9b4b8dd705a1a7f9c4b7c2128663d2e31b1d0265 Mon Sep 17 00:00:00 2001
From: Santosh Nayak <santoshprasadnayak@gmail.com>
Date: Sat, 23 Jun 2012 07:59:54 -0300
Subject: [PATCH 033/109] dvb-core: Release semaphore on error path
dvb_register_device()
commit 82163edcdfa4eb3d74516cc8e9f38dd3d039b67d upstream.
There is a missing "up_write()" here. Semaphore should be released
before returning error value.
Signed-off-by: Santosh Nayak <santoshprasadnayak@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/media/dvb/dvb-core/dvbdev.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c
index f732877..d5cda35 100644
--- a/drivers/media/dvb/dvb-core/dvbdev.c
+++ b/drivers/media/dvb/dvb-core/dvbdev.c
@@ -243,6 +243,7 @@ int dvb_register_device(struct dvb_adapter *adap, struct dvb_device **pdvbdev,
if (minor == MAX_DVB_MINORS) {
kfree(dvbdevfops);
kfree(dvbdev);
+ up_write(&minor_rwsem);
mutex_unlock(&dvbdev_register_lock);
return -EINVAL;
}
--
1.7.7.6
@@ -0,0 +1,52 @@
From 6c6190dbd1e0054c77445ed61dcbc70db441d4d2 Mon Sep 17 00:00:00 2001
From: Shinya Kuribayashi <shinya.kuribayashi.px@renesas.com>
Date: Sat, 7 Jul 2012 13:37:42 +0300
Subject: [PATCH 034/109] hwspinlock/core: use global ID to register
hwspinlocks on multiple devices
commit 476a7eeb60e70ddab138e7cb4bc44ef5ac20782e upstream.
Commit 300bab9770 (hwspinlock/core: register a bank of hwspinlocks in a
single API call, 2011-09-06) introduced 'hwspin_lock_register_single()'
to register numerous (a bank of) hwspinlock instances in a single API,
'hwspin_lock_register()'.
At which time, 'hwspin_lock_register()' accidentally passes 'local IDs'
to 'hwspin_lock_register_single()', despite that ..._single() requires
'global IDs' to register hwspinlocks.
We have to convert into global IDs by supplying the missing 'base_id'.
Signed-off-by: Shinya Kuribayashi <shinya.kuribayashi.px@renesas.com>
[ohad: fix error path of hwspin_lock_register, too]
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/hwspinlock/hwspinlock_core.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c
index 61c9cf1..1201a15 100644
--- a/drivers/hwspinlock/hwspinlock_core.c
+++ b/drivers/hwspinlock/hwspinlock_core.c
@@ -345,7 +345,7 @@ int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev,
spin_lock_init(&hwlock->lock);
hwlock->bank = bank;
- ret = hwspin_lock_register_single(hwlock, i);
+ ret = hwspin_lock_register_single(hwlock, base_id + i);
if (ret)
goto reg_failed;
}
@@ -354,7 +354,7 @@ int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev,
reg_failed:
while (--i >= 0)
- hwspin_lock_unregister_single(i);
+ hwspin_lock_unregister_single(base_id + i);
return ret;
}
EXPORT_SYMBOL_GPL(hwspin_lock_register);
--
1.7.7.6
@@ -0,0 +1,116 @@
From 65719aa5de077d1ccbfe535e9b934d6e91d11601 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 22 Jun 2012 10:52:34 -0700
Subject: [PATCH 035/109] libsas: fix taskfile corruption in
sas_ata_qc_fill_rtf
commit 6ef1b512f4e6f936d89aa20be3d97a7ec7c290ac upstream.
fill_result_tf() grabs the taskfile flags from the originating qc which
sas_ata_qc_fill_rtf() promptly overwrites. The presence of an
ata_taskfile in the sata_device makes it tempting to just copy the full
contents in sas_ata_qc_fill_rtf(). However, libata really only wants
the fis contents and expects the other portions of the taskfile to not
be touched by ->qc_fill_rtf. To that end store a fis buffer in the
sata_device and use ata_tf_from_fis() like every other ->qc_fill_rtf()
implementation.
Reported-by: Praveen Murali <pmurali@logicube.com>
Tested-by: Praveen Murali <pmurali@logicube.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
[bwh: Backported to 3.2: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/scsi/aic94xx/aic94xx_task.c | 2 +-
drivers/scsi/libsas/sas_ata.c | 12 ++++++------
include/scsi/libsas.h | 6 ++++--
3 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/scsi/aic94xx/aic94xx_task.c b/drivers/scsi/aic94xx/aic94xx_task.c
index 532d212..393e7ce 100644
--- a/drivers/scsi/aic94xx/aic94xx_task.c
+++ b/drivers/scsi/aic94xx/aic94xx_task.c
@@ -201,7 +201,7 @@ static void asd_get_response_tasklet(struct asd_ascb *ascb,
if (SAS_STATUS_BUF_SIZE >= sizeof(*resp)) {
resp->frame_len = le16_to_cpu(*(__le16 *)(r+6));
- memcpy(&resp->ending_fis[0], r+16, 24);
+ memcpy(&resp->ending_fis[0], r+16, ATA_RESP_FIS_SIZE);
ts->buf_valid_size = sizeof(*resp);
}
}
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index db9238f..4868fc9 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -112,12 +112,12 @@ static void sas_ata_task_done(struct sas_task *task)
if (stat->stat == SAS_PROTO_RESPONSE || stat->stat == SAM_STAT_GOOD ||
((stat->stat == SAM_STAT_CHECK_CONDITION &&
dev->sata_dev.command_set == ATAPI_COMMAND_SET))) {
- ata_tf_from_fis(resp->ending_fis, &dev->sata_dev.tf);
+ memcpy(dev->sata_dev.fis, resp->ending_fis, ATA_RESP_FIS_SIZE);
if (!link->sactive) {
- qc->err_mask |= ac_err_mask(dev->sata_dev.tf.command);
+ qc->err_mask |= ac_err_mask(dev->sata_dev.fis[2]);
} else {
- link->eh_info.err_mask |= ac_err_mask(dev->sata_dev.tf.command);
+ link->eh_info.err_mask |= ac_err_mask(dev->sata_dev.fis[2]);
if (unlikely(link->eh_info.err_mask))
qc->flags |= ATA_QCFLAG_FAILED;
}
@@ -138,8 +138,8 @@ static void sas_ata_task_done(struct sas_task *task)
qc->flags |= ATA_QCFLAG_FAILED;
}
- dev->sata_dev.tf.feature = 0x04; /* status err */
- dev->sata_dev.tf.command = ATA_ERR;
+ dev->sata_dev.fis[3] = 0x04; /* status err */
+ dev->sata_dev.fis[2] = ATA_ERR;
}
}
@@ -252,7 +252,7 @@ static bool sas_ata_qc_fill_rtf(struct ata_queued_cmd *qc)
{
struct domain_device *dev = qc->ap->private_data;
- memcpy(&qc->result_tf, &dev->sata_dev.tf, sizeof(qc->result_tf));
+ ata_tf_from_fis(dev->sata_dev.fis, &qc->result_tf);
return true;
}
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index 6a308d4..1e100c6 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -159,6 +159,8 @@ enum ata_command_set {
ATAPI_COMMAND_SET = 1,
};
+#define ATA_RESP_FIS_SIZE 24
+
struct sata_device {
enum ata_command_set command_set;
struct smp_resp rps_resp; /* report_phy_sata_resp */
@@ -170,7 +172,7 @@ struct sata_device {
struct ata_port *ap;
struct ata_host ata_host;
- struct ata_taskfile tf;
+ u8 fis[ATA_RESP_FIS_SIZE];
u32 sstatus;
u32 serror;
u32 scontrol;
@@ -486,7 +488,7 @@ enum exec_status {
*/
struct ata_task_resp {
u16 frame_len;
- u8 ending_fis[24]; /* dev to host or data-in */
+ u8 ending_fis[ATA_RESP_FIS_SIZE]; /* dev to host or data-in */
u32 sstatus;
u32 serror;
u32 scontrol;
--
1.7.7.6
@@ -0,0 +1,58 @@
From 2710006f98cf587ce6b3108f543689de1bb6d60b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 9 Jul 2012 11:34:13 +1000
Subject: [PATCH 036/109] md/raid1: fix use-after-free bug in RAID1 data-check
code.
commit 2d4f4f3384d4ef4f7c571448e803a1ce721113d5 upstream.
This bug has been present ever since data-check was introduce
in 2.6.16. However it would only fire if a data-check were
done on a degraded array, which was only possible if the array
has 3 or more devices. This is certainly possible, but is quite
uncommon.
Since hot-replace was added in 3.3 it can happen more often as
the same condition can arise if not all possible replacements are
present.
The problem is that as soon as we submit the last read request, the
'r1_bio' structure could be freed at any time, so we really should
stop looking at it. If the last device is being read from we will
stop looking at it. However if the last device is not due to be read
from, we will still check the bio pointer in the r1_bio, but the
r1_bio might already be free.
So use the read_targets counter to make sure we stop looking for bios
to submit as soon as we have submitted them all.
This fix is suitable for any -stable kernel since 2.6.16.
Reported-by: Arnold Schulz <arnysch@gmx.net>
Signed-off-by: NeilBrown <neilb@suse.de>
[bwh: Backported to 3.2: no doubling of conf->raid_disks; we don't have
hot-replace support]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/md/raid1.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7af60ec..58f0055 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2378,9 +2378,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
atomic_set(&r1_bio->remaining, read_targets);
- for (i=0; i<conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks && read_targets; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
+ read_targets--;
md_sync_acct(bio->bi_bdev, nr_sectors);
generic_make_request(bio);
}
--
1.7.7.6
@@ -0,0 +1,166 @@
From 45cd1f6207fb66990e5f25e11fb4cd9486c31794 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 9 Jul 2012 11:09:21 -0400
Subject: [PATCH 037/109] PCI: EHCI: fix crash during suspend on ASUS
computers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit dbf0e4c7257f8d684ec1a3c919853464293de66e upstream.
Quite a few ASUS computers experience a nasty problem, related to the
EHCI controllers, when going into system suspend. It was observed
that the problem didn't occur if the controllers were not put into the
D3 power state before starting the suspend, and commit
151b61284776be2d6f02d48c23c3625678960b97 (USB: EHCI: fix crash during
suspend on ASUS computers) was created to do this.
It turned out this approach messed up other computers that didn't have
the problem -- it prevented USB wakeup from working. Consequently
commit c2fb8a3fa25513de8fedb38509b1f15a5bbee47b (USB: add
NO_D3_DURING_SLEEP flag and revert 151b61284776be2) was merged; it
reverted the earlier commit and added a whitelist of known good board
names.
Now we know the actual cause of the problem. Thanks to AceLan Kao for
tracking it down.
According to him, an engineer at ASUS explained that some of their
BIOSes contain a bug that was added in an attempt to work around a
problem in early versions of Windows. When the computer goes into S3
suspend, the BIOS tries to verify that the EHCI controllers were first
quiesced by the OS. Nothing's wrong with this, but the BIOS does it
by checking that the PCI COMMAND registers contain 0 without checking
the controllers' power state. If the register isn't 0, the BIOS
assumes the controller needs to be quiesced and tries to do so. This
involves making various MMIO accesses to the controller, which don't
work very well if the controller is already in D3. The end result is
a system hang or memory corruption.
Since the value in the PCI COMMAND register doesn't matter once the
controller has been suspended, and since the value will be restored
anyway when the controller is resumed, we can work around the BIOS bug
simply by setting the register to 0 during system suspend. This patch
(as1590) does so and also reverts the second commit mentioned above,
which is now unnecessary.
In theory we could do this for every PCI device. However to avoid
introducing new problems, the patch restricts itself to EHCI host
controllers.
Finally the affected systems can suspend with USB wakeup working
properly.
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=37632
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=42728
Based-on-patch-by: AceLan Kao <acelan.kao@canonical.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Tested-by: Dâniel Fraga <fragabr@gmail.com>
Tested-by: Javier Marcet <jmarcet@gmail.com>
Tested-by: Andrey Rahmatullin <wrar@wrar.name>
Tested-by: Oleksij Rempel <bug-track@fisher-privat.net>
Tested-by: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/pci/pci-driver.c | 12 ++++++++++++
drivers/pci/pci.c | 5 -----
drivers/pci/quirks.c | 26 --------------------------
include/linux/pci.h | 2 --
4 files changed, 12 insertions(+), 33 deletions(-)
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 12d1e81..d024f83 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -742,6 +742,18 @@ static int pci_pm_suspend_noirq(struct device *dev)
pci_pm_set_unknown_state(pci_dev);
+ /*
+ * Some BIOSes from ASUS have a bug: If a USB EHCI host controller's
+ * PCI COMMAND register isn't 0, the BIOS assumes that the controller
+ * hasn't been quiesced and tries to turn it off. If the controller
+ * is already in D3, this can hang or cause memory corruption.
+ *
+ * Since the value of the COMMAND register doesn't matter once the
+ * device has been suspended, we can safely set it to 0 here.
+ */
+ if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+ pci_write_config_word(pci_dev, PCI_COMMAND, 0);
+
return 0;
}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e5b75eb..6d4a531 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1689,11 +1689,6 @@ int pci_prepare_to_sleep(struct pci_dev *dev)
if (target_state == PCI_POWER_ERROR)
return -EIO;
- /* Some devices mustn't be in D3 during system sleep */
- if (target_state == PCI_D3hot &&
- (dev->dev_flags & PCI_DEV_FLAGS_NO_D3_DURING_SLEEP))
- return 0;
-
pci_enable_wake(dev, target_state, device_may_wakeup(&dev->dev));
error = pci_set_power_state(dev, target_state);
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 3c56fec..78fda9c 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -2940,32 +2940,6 @@ static void __devinit disable_igfx_irq(struct pci_dev *dev)
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0102, disable_igfx_irq);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x010a, disable_igfx_irq);
-/*
- * The Intel 6 Series/C200 Series chipset's EHCI controllers on many
- * ASUS motherboards will cause memory corruption or a system crash
- * if they are in D3 while the system is put into S3 sleep.
- */
-static void __devinit asus_ehci_no_d3(struct pci_dev *dev)
-{
- const char *sys_info;
- static const char good_Asus_board[] = "P8Z68-V";
-
- if (dev->dev_flags & PCI_DEV_FLAGS_NO_D3_DURING_SLEEP)
- return;
- if (dev->subsystem_vendor != PCI_VENDOR_ID_ASUSTEK)
- return;
- sys_info = dmi_get_system_info(DMI_BOARD_NAME);
- if (sys_info && memcmp(sys_info, good_Asus_board,
- sizeof(good_Asus_board) - 1) == 0)
- return;
-
- dev_info(&dev->dev, "broken D3 during system sleep on ASUS\n");
- dev->dev_flags |= PCI_DEV_FLAGS_NO_D3_DURING_SLEEP;
- device_set_wakeup_capable(&dev->dev, false);
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1c26, asus_ehci_no_d3);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1c2d, asus_ehci_no_d3);
-
static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f,
struct pci_fixup *end)
{
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c0cfa0d..7cda65b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -176,8 +176,6 @@ enum pci_dev_flags {
PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2,
/* Provide indication device is assigned by a Virtual Machine Manager */
PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4,
- /* Device causes system crash if in D3 during S3 sleep */
- PCI_DEV_FLAGS_NO_D3_DURING_SLEEP = (__force pci_dev_flags_t) 8,
};
enum pci_irq_reroute_variant {
--
1.7.7.6
@@ -0,0 +1,114 @@
From c9a4beeb70f62ec5976dcbb9086683fda56d6aec Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@huawei.com>
Date: Wed, 11 Jul 2012 14:01:52 -0700
Subject: [PATCH 038/109] memory hotplug: fix invalid memory access caused by
stale kswapd pointer
commit d8adde17e5f858427504725218c56aef90e90fc7 upstream.
kswapd_stop() is called to destroy the kswapd work thread when all memory
of a NUMA node has been offlined. But kswapd_stop() only terminates the
work thread without resetting NODE_DATA(nid)->kswapd to NULL. The stale
pointer will prevent kswapd_run() from creating a new work thread when
adding memory to the memory-less NUMA node again. Eventually the stale
pointer may cause invalid memory access.
An example stack dump as below. It's reproduced with 2.6.32, but latest
kernel has the same issue.
BUG: unable to handle kernel NULL pointer dereference at (null)
IP: [<ffffffff81051a94>] exit_creds+0x12/0x78
PGD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/system/memory/memory391/state
CPU 11
Modules linked in: cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq microcode fuse loop dm_mod tpm_tis rtc_cmos i2c_i801 rtc_core tpm serio_raw pcspkr sg tpm_bios igb i2c_core iTCO_wdt rtc_lib mptctl iTCO_vendor_support button dca bnx2 usbhid hid uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan ide_pci_generic ide_core ata_generic ata_piix libata thermal processor thermal_sys hwmon mptsas mptscsih mptbase scsi_transport_sas scsi_mod
Pid: 7949, comm: sh Not tainted 2.6.32.12-qiuxishi-5-default #92 Tecal RH2285
RIP: 0010:exit_creds+0x12/0x78
RSP: 0018:ffff8806044f1d78 EFLAGS: 00010202
RAX: 0000000000000000 RBX: ffff880604f22140 RCX: 0000000000019502
RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000
RBP: ffff880604f22150 R08: 0000000000000000 R09: ffffffff81a4dc10
R10: 00000000000032a0 R11: ffff880006202500 R12: 0000000000000000
R13: 0000000000c40000 R14: 0000000000008000 R15: 0000000000000001
FS: 00007fbc03d066f0(0000) GS:ffff8800282e0000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000000 CR3: 000000060f029000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process sh (pid: 7949, threadinfo ffff8806044f0000, task ffff880603d7c600)
Stack:
ffff880604f22140 ffffffff8103aac5 ffff880604f22140 ffffffff8104d21e
ffff880006202500 0000000000008000 0000000000c38000 ffffffff810bd5b1
0000000000000000 ffff880603d7c600 00000000ffffdd29 0000000000000003
Call Trace:
__put_task_struct+0x5d/0x97
kthread_stop+0x50/0x58
offline_pages+0x324/0x3da
memory_block_change_state+0x179/0x1db
store_mem_state+0x9e/0xbb
sysfs_write_file+0xd0/0x107
vfs_write+0xad/0x169
sys_write+0x45/0x6e
system_call_fastpath+0x16/0x1b
Code: ff 4d 00 0f 94 c0 84 c0 74 08 48 89 ef e8 1f fd ff ff 5b 5d 31 c0 41 5c c3 53 48 8b 87 20 06 00 00 48 89 fb 48 8b bf 18 06 00 00 <8b> 00 48 c7 83 18 06 00 00 00 00 00 00 f0 ff 0f 0f 94 c0 84 c0
RIP exit_creds+0x12/0x78
RSP <ffff8806044f1d78>
CR2: 0000000000000000
[akpm@linux-foundation.org: add pglist_data.kswapd locking comments]
Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
include/linux/mmzone.h | 2 +-
mm/vmscan.c | 7 +++++--
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 188cb2f..905b1e1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -652,7 +652,7 @@ typedef struct pglist_data {
range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
- struct task_struct *kswapd;
+ struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
int kswapd_max_order;
enum zone_type classzone_idx;
} pg_data_t;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fbe2d2c..72cf498 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3090,14 +3090,17 @@ int kswapd_run(int nid)
}
/*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold lock_memory_hotplug().
*/
void kswapd_stop(int nid)
{
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
- if (kswapd)
+ if (kswapd) {
kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
+ }
}
static int __init kswapd_init(void)
--
1.7.7.6
@@ -0,0 +1,43 @@
From 827be17f16d9325db1c05f012ceab345139f7cad Mon Sep 17 00:00:00 2001
From: Luis Henriques <luis.henriques@canonical.com>
Date: Wed, 11 Jul 2012 14:02:10 -0700
Subject: [PATCH 039/109] ocfs2: fix NULL pointer dereference in
__ocfs2_change_file_space()
commit a4e08d001f2e50bb8b3c4eebadcf08e5535f02ee upstream.
As ocfs2_fallocate() will invoke __ocfs2_change_file_space() with a NULL
as the first parameter (file), it may trigger a NULL pointer dereferrence
due to a missing check.
Addresses http://bugs.launchpad.net/bugs/1006012
Signed-off-by: Luis Henriques <luis.henriques@canonical.com>
Reported-by: Bret Towe <magnade@gmail.com>
Tested-by: Bret Towe <magnade@gmail.com>
Cc: Sunil Mushran <sunil.mushran@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/ocfs2/file.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 07ee5b4..1c7d45e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1950,7 +1950,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ret < 0)
mlog_errno(ret);
- if (file->f_flags & O_SYNC)
+ if (file && (file->f_flags & O_SYNC))
handle->h_sync = 1;
ocfs2_commit_trans(osb, handle);
--
1.7.7.6
@@ -0,0 +1,53 @@
From 6a918e81eb228757f20a244ee0d81c32ba7feedc Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 11 Jul 2012 14:02:13 -0700
Subject: [PATCH 040/109] mm, thp: abort compaction if migration page cannot
be charged to memcg
commit 4bf2bba3750f10aa9e62e6949bc7e8329990f01b upstream.
If page migration cannot charge the temporary page to the memcg,
migrate_pages() will return -ENOMEM. This isn't considered in memory
compaction however, and the loop continues to iterate over all
pageblocks trying to isolate and migrate pages. If a small number of
very large memcgs happen to be oom, however, these attempts will mostly
be futile leading to an enormous amout of cpu consumption due to the
page migration failures.
This patch will short circuit and fail memory compaction if
migrate_pages() returns -ENOMEM. COMPACT_PARTIAL is returned in case
some migrations were successful so that the page allocator will retry.
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
mm/compaction.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 8fb8a40..50f1c60 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -592,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ if (err == -ENOMEM) {
+ ret = COMPACT_PARTIAL;
+ goto out;
+ }
}
-
}
out:
--
1.7.7.6
@@ -0,0 +1,74 @@
From 810c142eafb17318d32209a727060a756a57235d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Th=C3=A9baudeau?=
<benoit.thebaudeau@advansee.com>
Date: Wed, 11 Jul 2012 14:02:32 -0700
Subject: [PATCH 041/109] drivers/rtc/rtc-mxc.c: fix irq enabled interrupts
warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit b59f6d1febd6cbe9fae4589bf72da0ed32bc69e0 upstream.
Fixes
WARNING: at irq/handle.c:146 handle_irq_event_percpu+0x19c/0x1b8()
irq 25 handler mxc_rtc_interrupt+0x0/0xac enabled interrupts
Modules linked in:
(unwind_backtrace+0x0/0xf0) from (warn_slowpath_common+0x4c/0x64)
(warn_slowpath_common+0x4c/0x64) from (warn_slowpath_fmt+0x30/0x40)
(warn_slowpath_fmt+0x30/0x40) from (handle_irq_event_percpu+0x19c/0x1b8)
(handle_irq_event_percpu+0x19c/0x1b8) from (handle_irq_event+0x28/0x38)
(handle_irq_event+0x28/0x38) from (handle_level_irq+0x80/0xc4)
(handle_level_irq+0x80/0xc4) from (generic_handle_irq+0x24/0x38)
(generic_handle_irq+0x24/0x38) from (handle_IRQ+0x30/0x84)
(handle_IRQ+0x30/0x84) from (avic_handle_irq+0x2c/0x4c)
(avic_handle_irq+0x2c/0x4c) from (__irq_svc+0x40/0x60)
Exception stack(0xc050bf60 to 0xc050bfa8)
bf60: 00000001 00000000 003c4208 c0018e20 c050a000 c050a000 c054a4c8 c050a000
bf80: c05157a8 4117b363 80503bb4 00000000 01000000 c050bfa8 c0018e2c c000e808
bfa0: 60000013 ffffffff
(__irq_svc+0x40/0x60) from (default_idle+0x1c/0x30)
(default_idle+0x1c/0x30) from (cpu_idle+0x68/0xa8)
(cpu_idle+0x68/0xa8) from (start_kernel+0x22c/0x26c)
Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Sascha Hauer <kernel@pengutronix.de>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/rtc/rtc-mxc.c | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index 39e41fb..5160354 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -191,10 +191,11 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id)
struct platform_device *pdev = dev_id;
struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
void __iomem *ioaddr = pdata->ioaddr;
+ unsigned long flags;
u32 status;
u32 events = 0;
- spin_lock_irq(&pdata->rtc->irq_lock);
+ spin_lock_irqsave(&pdata->rtc->irq_lock, flags);
status = readw(ioaddr + RTC_RTCISR) & readw(ioaddr + RTC_RTCIENR);
/* clear interrupt sources */
writew(status, ioaddr + RTC_RTCISR);
@@ -217,7 +218,7 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id)
rtc_update_alarm(&pdev->dev, &pdata->g_rtc_alarm);
rtc_update_irq(pdata->rtc, 1, events);
- spin_unlock_irq(&pdata->rtc->irq_lock);
+ spin_unlock_irqrestore(&pdata->rtc->irq_lock, flags);
return IRQ_HANDLED;
}
--
1.7.7.6
@@ -0,0 +1,49 @@
From e3b50d463ce69af9fb5b2d4a54e6c37637c2ecef Mon Sep 17 00:00:00 2001
From: Bob Liu <lliubbo@gmail.com>
Date: Wed, 11 Jul 2012 14:02:35 -0700
Subject: [PATCH 042/109] fs: ramfs: file-nommu: add SetPageUptodate()
commit fea9f718b3d68147f162ed2d870183ce5e0ad8d8 upstream.
There is a bug in the below scenario for !CONFIG_MMU:
1. create a new file
2. mmap the file and write to it
3. read the file can't get the correct value
Because
sys_read() -> generic_file_aio_read() -> simple_readpage() -> clear_page()
which causes the page to be zeroed.
Add SetPageUptodate() to ramfs_nommu_expand_for_mapping() so that
generic_file_aio_read() do not call simple_readpage().
Signed-off-by: Bob Liu <lliubbo@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greg Ungerer <gerg@uclinux.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/ramfs/file-nommu.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index fbb0b47..d5378d0 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -110,6 +110,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* prevent the page from being discarded on memory pressure */
SetPageDirty(page);
+ SetPageUptodate(page);
unlock_page(page);
put_page(page);
--
1.7.7.6
@@ -0,0 +1,53 @@
From 8d63c0484f5ed79f498c5bf3e5b90de29555bcb7 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Thu, 12 Jul 2012 12:24:33 +0200
Subject: [PATCH 043/109] cpufreq / ACPI: Fix not loading acpi-cpufreq driver
regression
commit c4686c71a9183f76e3ef59098da5c098748672f6 upstream.
Commit d640113fe80e45ebd4a5b420b introduced a regression on SMP
systems where the processor core with ACPI id zero is disabled
(typically should be the case because of hyperthreading).
The regression got spread through stable kernels.
On 3.0.X it got introduced via 3.0.18.
Such platforms may be rare, but do exist.
Look out for a disabled processor with acpi_id 0 in dmesg:
ACPI: LAPIC (acpi_id[0x00] lapic_id[0x10] disabled)
This problem has been observed on a:
HP Proliant BL280c G6 blade
This patch restricts the introduced workaround to platforms
with nr_cpu_ids <= 1.
Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/acpi/processor_core.c | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index c850de4..eff7222 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -189,10 +189,12 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
* Processor (CPU3, 0x03, 0x00000410, 0x06) {}
* }
*
- * Ignores apic_id and always return 0 for CPU0's handle.
+ * Ignores apic_id and always returns 0 for the processor
+ * handle with acpi id 0 if nr_cpu_ids is 1.
+ * This should be the case if SMP tables are not found.
* Return -1 for other CPU's handle.
*/
- if (acpi_id == 0)
+ if (nr_cpu_ids <= 1 && acpi_id == 0)
return acpi_id;
else
return apic_id;
--
1.7.7.6
@@ -0,0 +1,37 @@
From 9b12ab6f8a8a5859e0165b3510dbecae16ca98e3 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Thu, 12 Jul 2012 22:47:37 +0200
Subject: [PATCH 044/109] hwmon: (it87) Preserve configuration register bits
on init
commit 41002f8dd5938d5ad1d008ce5bfdbfe47fa7b4e8 upstream.
We were accidentally losing one bit in the configuration register on
device initialization. It was reported to freeze one specific system
right away. Properly preserve all bits we don't explicitly want to
change in order to prevent that.
Reported-by: Stevie Trujillo <stevie.trujillo@gmail.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/hwmon/it87.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/hwmon/it87.c b/drivers/hwmon/it87.c
index d912649..1ba7af2 100644
--- a/drivers/hwmon/it87.c
+++ b/drivers/hwmon/it87.c
@@ -2086,7 +2086,7 @@ static void __devinit it87_init_device(struct platform_device *pdev)
/* Start monitoring */
it87_write_value(data, IT87_REG_CONFIG,
- (it87_read_value(data, IT87_REG_CONFIG) & 0x36)
+ (it87_read_value(data, IT87_REG_CONFIG) & 0x3e)
| (update_vbat ? 0x41 : 0x01));
}
--
1.7.7.6
@@ -0,0 +1,42 @@
From 81b7824449f04aec76681f7723b0f7911ad66f11 Mon Sep 17 00:00:00 2001
From: Todd Poynor <toddpoynor@google.com>
Date: Fri, 13 Jul 2012 15:30:48 +0900
Subject: [PATCH 045/109] ARM: SAMSUNG: fix race in s3c_adc_start for ADC
commit 8265981bb439f3ecc5356fb877a6c2a6636ac88a upstream.
Checking for adc->ts_pend already claimed should be done with the
lock held.
Signed-off-by: Todd Poynor <toddpoynor@google.com>
Acked-by: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Kukjin Kim <kgene.kim@samsung.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/arm/plat-samsung/adc.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/arch/arm/plat-samsung/adc.c b/arch/arm/plat-samsung/adc.c
index 33ecd0c..b1e05cc 100644
--- a/arch/arm/plat-samsung/adc.c
+++ b/arch/arm/plat-samsung/adc.c
@@ -157,11 +157,13 @@ int s3c_adc_start(struct s3c_adc_client *client,
return -EINVAL;
}
- if (client->is_ts && adc->ts_pend)
- return -EAGAIN;
-
spin_lock_irqsave(&adc->lock, flags);
+ if (client->is_ts && adc->ts_pend) {
+ spin_unlock_irqrestore(&adc->lock, flags);
+ return -EAGAIN;
+ }
+
client->channel = channel;
client->nr_samples = nr_samples;
--
1.7.7.6
@@ -0,0 +1,116 @@
From 898f4d272514d19aafee8cd66b796c0553fca080 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Thu, 12 Jul 2012 09:43:14 -0400
Subject: [PATCH 046/109] block: fix infinite loop in __getblk_slow
commit 91f68c89d8f35fe98ea04159b9a3b42d0149478f upstream.
Commit 080399aaaf35 ("block: don't mark buffers beyond end of disk as
mapped") exposed a bug in __getblk_slow that causes mount to hang as it
loops infinitely waiting for a buffer that lies beyond the end of the
disk to become uptodate.
The problem was initially reported by Torsten Hilbrich here:
https://lkml.org/lkml/2012/6/18/54
and also reported independently here:
http://www.sysresccd.org/forums/viewtopic.php?f=13&t=4511
and then Richard W.M. Jones and Marcos Mello noted a few separate
bugzillas also associated with the same issue. This patch has been
confirmed to fix:
https://bugzilla.redhat.com/show_bug.cgi?id=835019
The main problem is here, in __getblk_slow:
for (;;) {
struct buffer_head * bh;
int ret;
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
ret = grow_buffers(bdev, block, size);
if (ret < 0)
return NULL;
if (ret == 0)
free_more_memory();
}
__find_get_block does not find the block, since it will not be marked as
mapped, and so grow_buffers is called to fill in the buffers for the
associated page. I believe the for (;;) loop is there primarily to
retry in the case of memory pressure keeping grow_buffers from
succeeding. However, we also continue to loop for other cases, like the
block lying beond the end of the disk. So, the fix I came up with is to
only loop when grow_buffers fails due to memory allocation issues
(return value of 0).
The attached patch was tested by myself, Torsten, and Rich, and was
found to resolve the problem in call cases.
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Reported-and-Tested-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Reviewed-by: Josh Boyer <jwboyer@redhat.com>
[ Jens is on vacation, taking this directly - Linus ]
--
Stable Notes: this patch requires backport to 3.0, 3.2 and 3.3.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/buffer.c | 22 +++++++++++++---------
1 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index c807931..4115eca 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1087,6 +1087,9 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
+ int ret;
+ struct buffer_head *bh;
+
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
(size < 512 || size > PAGE_SIZE))) {
@@ -1099,20 +1102,21 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
return NULL;
}
- for (;;) {
- struct buffer_head * bh;
- int ret;
+retry:
+ bh = __find_get_block(bdev, block, size);
+ if (bh)
+ return bh;
+ ret = grow_buffers(bdev, block, size);
+ if (ret == 0) {
+ free_more_memory();
+ goto retry;
+ } else if (ret > 0) {
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
-
- ret = grow_buffers(bdev, block, size);
- if (ret < 0)
- return NULL;
- if (ret == 0)
- free_more_memory();
}
+ return NULL;
}
/*
--
1.7.7.6
@@ -0,0 +1,42 @@
From 51b23c5c8a3aacf16acf8b723c35a23c07c37115 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 13 Jul 2012 13:35:36 -0400
Subject: [PATCH 047/109] Remove easily user-triggerable BUG from
generic_setlease
commit 8d657eb3b43861064d36241e88d9d61c709f33f0 upstream.
This can be trivially triggered from userspace by passing in something unexpected.
kernel BUG at fs/locks.c:1468!
invalid opcode: 0000 [#1] SMP
RIP: 0010:generic_setlease+0xc2/0x100
Call Trace:
__vfs_setlease+0x35/0x40
fcntl_setlease+0x76/0x150
sys_fcntl+0x1c6/0x810
system_call_fastpath+0x1a/0x1f
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/locks.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index 0d68f1f..6a64f15 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1465,7 +1465,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
case F_WRLCK:
return generic_add_lease(filp, arg, flp);
default:
- BUG();
+ return -EINVAL;
}
}
EXPORT_SYMBOL(generic_setlease);
--
1.7.7.6
@@ -0,0 +1,32 @@
From 8750544d6522f38e7f5722ee263d0f95941c9bd8 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Thu, 10 May 2012 19:45:51 +0200
Subject: [PATCH 048/109] NFC: Export nfc.h to userland
commit dbd4fcaf8d664fab4163b1f8682e41ad8bff3444 upstream.
The netlink commands and attributes, along with the socket structure
definitions need to be exported.
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
include/linux/Kbuild | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index bd21ecd..a3ce901 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -268,6 +268,7 @@ header-y += netfilter_ipv4.h
header-y += netfilter_ipv6.h
header-y += netlink.h
header-y += netrom.h
+header-y += nfc.h
header-y += nfs.h
header-y += nfs2.h
header-y += nfs3.h
--
1.7.7.6
@@ -0,0 +1,178 @@
From b6807062ada796cdfde2c0f5ca59390b0c916aae Mon Sep 17 00:00:00 2001
From: Bojan Smojver <bojan@rexursive.com>
Date: Sun, 29 Apr 2012 22:42:06 +0200
Subject: [PATCH 049/109] PM / Hibernate: Hibernate/thaw fixes/improvements
commit 5a21d489fd9541a4a66b9a500659abaca1b19a51 upstream.
1. Do not allocate memory for buffers from emergency pools, unless
absolutely required. Do not warn about and do not retry non-essential
failed allocations.
2. Do not check the amount of free pages left on every single page
write, but wait until one map is completely populated and then check.
3. Set maximum number of pages for read buffering consistently, instead
of inadvertently depending on the size of the sector type.
4. Fix copyright line, which I missed when I submitted the hibernation
threading patch.
5. Dispense with bit shifting arithmetic to improve readability.
6. Really recalculate the number of pages required to be free after all
allocations have been done.
7. Fix calculation of pages required for read buffering. Only count in
pages that do not belong to high memory.
Signed-off-by: Bojan Smojver <bojan@rexursive.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/power/swap.c | 62 ++++++++++++++++++++++++++++++++-------------------
1 files changed, 39 insertions(+), 23 deletions(-)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b313086..64f8f97 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
*
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
+ * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
*
* This file is released under the GPLv2.
*
@@ -283,14 +283,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
return -ENOSPC;
if (bio_chain) {
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ src = (void *)__get_free_page(__GFP_WAIT |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
@@ -368,12 +371,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
- }
- if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
- error = hib_wait_on_bio_chain(bio_chain);
- if (error)
- goto out;
- handle->reqd_free_pages = reqd_free_pages();
+
+ if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
+ error = hib_wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
+ /*
+ * Recalculate the number of required free pages, to
+ * make sure we never take more than half.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+ }
}
out:
return error;
@@ -420,8 +428,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
/* Maximum number of threads for compression/decompression. */
#define LZO_THREADS 3
-/* Maximum number of pages for read buffering. */
-#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
+/* Minimum/maximum number of pages for read buffering. */
+#define LZO_MIN_RD_PAGES 1024
+#define LZO_MAX_RD_PAGES 8192
/**
@@ -632,12 +641,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
/*
- * Adjust number of free pages after all allocations have been done.
- * We don't want to run out of pages when writing.
- */
- handle->reqd_free_pages = reqd_free_pages();
-
- /*
* Start the CRC32 thread.
*/
init_waitqueue_head(&crc->go);
@@ -658,6 +661,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
+ /*
+ * Adjust the number of required free pages after all allocations have
+ * been done. We don't want to run out of pages when writing.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+
printk(KERN_INFO
"PM: Using %u thread(s) for compression.\n"
"PM: Compressing and saving image data (%u pages) ... ",
@@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
unsigned i, thr, run_threads, nr_threads;
unsigned ring = 0, pg = 0, ring_size = 0,
have = 0, want, need, asked = 0;
- unsigned long read_pages;
+ unsigned long read_pages = 0;
unsigned char **page = NULL;
struct dec_data *data = NULL;
struct crc_data *crc = NULL;
@@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
+ page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
if (!page) {
printk(KERN_ERR "PM: Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
/*
- * Adjust number of pages for read buffering, in case we are short.
+ * Set the number of pages for read buffering.
+ * This is complete guesswork, because we'll only know the real
+ * picture once prepare_image() is called, which is much later on
+ * during the image load phase. We'll assume the worst case and
+ * say that none of the image pages are from high memory.
*/
- read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
- read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
+ if (low_free_pages() > snapshot_get_image_size())
+ read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
+ read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
__GFP_WAIT | __GFP_HIGH :
- __GFP_WAIT);
+ __GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
+
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
ring_size = i;
--
1.7.7.6
@@ -0,0 +1,40 @@
From c83354ed38bbb4ebfa25f954d825594453b160c1 Mon Sep 17 00:00:00 2001
From: Michal Kazior <michal.kazior@tieto.com>
Date: Fri, 8 Jun 2012 10:55:44 +0200
Subject: [PATCH 050/109] cfg80211: check iface combinations only when iface
is running
commit f8cdddb8d61d16a156229f0910f7ecfc7a82c003 upstream.
Don't validate interface combinations on a stopped
interface. Otherwise we might end up being able to
create a new interface with a certain type, but
won't be able to change an existing interface
into that type.
This also skips some other functions when
interface is stopped and changing interface type.
Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
net/wireless/util.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d38815d..74d5292 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -813,7 +813,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
ntype == NL80211_IFTYPE_P2P_CLIENT))
return -EBUSY;
- if (ntype != otype) {
+ if (ntype != otype && netif_running(dev)) {
err = cfg80211_can_change_interface(rdev, dev->ieee80211_ptr,
ntype);
if (err)
--
1.7.7.6
@@ -0,0 +1,70 @@
From ea2ca0ebd427d4a745043e6e030619221fe9a55b Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 25 Jun 2012 15:07:17 +0200
Subject: [PATCH 051/109] intel_ips: blacklist HP ProBook laptops
commit 88ca518b0bb4161e5f20f8a1d9cc477cae294e54 upstream.
intel_ips driver spews the warning message
"ME failed to update for more than 1s, likely hung"
at each second endlessly on HP ProBook laptops with IronLake.
As this has never worked, better to blacklist the driver for now.
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/platform/x86/intel_ips.c | 22 ++++++++++++++++++++++
1 files changed, 22 insertions(+), 0 deletions(-)
diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c
index 809a3ae..b46ec11 100644
--- a/drivers/platform/x86/intel_ips.c
+++ b/drivers/platform/x86/intel_ips.c
@@ -72,6 +72,7 @@
#include <linux/string.h>
#include <linux/tick.h>
#include <linux/timer.h>
+#include <linux/dmi.h>
#include <drm/i915_drm.h>
#include <asm/msr.h>
#include <asm/processor.h>
@@ -1505,6 +1506,24 @@ static DEFINE_PCI_DEVICE_TABLE(ips_id_table) = {
MODULE_DEVICE_TABLE(pci, ips_id_table);
+static int ips_blacklist_callback(const struct dmi_system_id *id)
+{
+ pr_info("Blacklisted intel_ips for %s\n", id->ident);
+ return 1;
+}
+
+static const struct dmi_system_id ips_blacklist[] = {
+ {
+ .callback = ips_blacklist_callback,
+ .ident = "HP ProBook",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP ProBook"),
+ },
+ },
+ { } /* terminating entry */
+};
+
static int ips_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
u64 platform_info;
@@ -1514,6 +1533,9 @@ static int ips_probe(struct pci_dev *dev, const struct pci_device_id *id)
u16 htshi, trc, trc_required_mask;
u8 tse;
+ if (dmi_check_system(ips_blacklist))
+ return -ENODEV;
+
ips = kzalloc(sizeof(struct ips_driver), GFP_KERNEL);
if (!ips)
return -ENOMEM;
--
1.7.7.6
@@ -0,0 +1,42 @@
From 0e6bee2eb164145946ea6ca49c4fd1e02c7177fa Mon Sep 17 00:00:00 2001
From: Cloud Ren <cjren@qca.qualcomm.com>
Date: Tue, 3 Jul 2012 16:51:48 +0000
Subject: [PATCH 052/109] atl1c: fix issue of transmit queue 0 timed out
commit b94e52f62683dc0b00c6d1b58b80929a078c0fd5 upstream.
some people report atl1c could cause system hang with following
kernel trace info:
---------------------------------------
WARNING: at.../net/sched/sch_generic.c:258 dev_watchdog+0x1db/0x1d0()
...
NETDEV WATCHDOG: eth0 (atl1c): transmit queue 0 timed out
...
---------------------------------------
This is caused by netif_stop_queue calling when cable Link is down.
So remove netif_stop_queue, because link_watch will take it over.
Signed-off-by: xiong <xiong@qca.qualcomm.com>
Signed-off-by: Cloud Ren <cjren@qca.qualcomm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
[bwh: Backported to 3.2: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 1 -
1 files changed, 0 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index eccdcff..5ae7df7 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -267,7 +267,6 @@ static void atl1c_check_link_status(struct atl1c_adapter *adapter)
dev_warn(&pdev->dev, "stop mac failed\n");
atl1c_set_aspm(hw, false);
netif_carrier_off(netdev);
- netif_stop_queue(netdev);
atl1c_phy_reset(hw);
atl1c_phy_init(&adapter->hw);
} else {
--
1.7.7.6
@@ -0,0 +1,51 @@
From 6b52d1306665e9da06ac76126a97888849dbf290 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 4 Jul 2012 13:10:02 +0200
Subject: [PATCH 053/109] rt2x00usb: fix indexes ordering on RX queue kick
commit efd821182cec8c92babef6e00a95066d3252fda4 upstream.
On rt2x00_dmastart() we increase index specified by Q_INDEX and on
rt2x00_dmadone() we increase index specified by Q_INDEX_DONE. So entries
between Q_INDEX_DONE and Q_INDEX are those we currently process in the
hardware. Entries between Q_INDEX and Q_INDEX_DONE are those we can
submit to the hardware.
According to that fix rt2x00usb_kick_queue(), as we need to submit RX
entries that are not processed by the hardware. It worked before only
for empty queue, otherwise was broken.
Note that for TX queues indexes ordering are ok. We need to kick entries
that have filled skb, but was not submitted to the hardware, i.e.
started from Q_INDEX_DONE and have ENTRY_DATA_PENDING bit set.
From practical standpoint this fixes RX queue stall, usually reproducible
in AP mode, like for example reported here:
https://bugzilla.redhat.com/show_bug.cgi?id=828824
Reported-and-tested-by: Franco Miceli <fmiceli@plan.ceibal.edu.uy>
Reported-and-tested-by: Tom Horsley <horsley1953@gmail.com>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/wireless/rt2x00/rt2x00usb.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/wireless/rt2x00/rt2x00usb.c b/drivers/net/wireless/rt2x00/rt2x00usb.c
index 1e31050..ba28807 100644
--- a/drivers/net/wireless/rt2x00/rt2x00usb.c
+++ b/drivers/net/wireless/rt2x00/rt2x00usb.c
@@ -426,8 +426,8 @@ void rt2x00usb_kick_queue(struct data_queue *queue)
case QID_RX:
if (!rt2x00queue_full(queue))
rt2x00queue_for_each_entry(queue,
- Q_INDEX_DONE,
Q_INDEX,
+ Q_INDEX_DONE,
NULL,
rt2x00usb_kick_rx_entry);
break;
--
1.7.7.6
@@ -0,0 +1,65 @@
From b7d2c1e70d2c94585ac5839e38b861bdc6d469d2 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 4 Jul 2012 13:20:20 +0200
Subject: [PATCH 054/109] iwlegacy: always monitor for stuck queues
commit c2ca7d92ed4bbd779516beb6eb226e19f7f7ab0f upstream.
This is iwlegacy version of:
commit 342bbf3fee2fa9a18147e74b2e3c4229a4564912
Author: Johannes Berg <johannes.berg@intel.com>
Date: Sun Mar 4 08:50:46 2012 -0800
iwlwifi: always monitor for stuck queues
If we only monitor while associated, the following
can happen:
- we're associated, and the queue stuck check
runs, setting the queue "touch" time to X
- we disassociate, stopping the monitoring,
which leaves the time set to X
- almost 2s later, we associate, and enqueue
a frame
- before the frame is transmitted, we monitor
for stuck queues, and find the time set to
X, although it is now later than X + 2000ms,
so we decide that the queue is stuck and
erroneously restart the device
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
[bwh: Backported to 3.2: adjust filename, function and variable names]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/wireless/iwlegacy/iwl-core.c | 14 ++++++--------
1 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/drivers/net/wireless/iwlegacy/iwl-core.c b/drivers/net/wireless/iwlegacy/iwl-core.c
index 2bd5659..1bb64c9 100644
--- a/drivers/net/wireless/iwlegacy/iwl-core.c
+++ b/drivers/net/wireless/iwlegacy/iwl-core.c
@@ -1884,14 +1884,12 @@ void iwl_legacy_bg_watchdog(unsigned long data)
return;
/* monitor and check for other stuck queues */
- if (iwl_legacy_is_any_associated(priv)) {
- for (cnt = 0; cnt < priv->hw_params.max_txq_num; cnt++) {
- /* skip as we already checked the command queue */
- if (cnt == priv->cmd_queue)
- continue;
- if (iwl_legacy_check_stuck_queue(priv, cnt))
- return;
- }
+ for (cnt = 0; cnt < priv->hw_params.max_txq_num; cnt++) {
+ /* skip as we already checked the command queue */
+ if (cnt == priv->cmd_queue)
+ continue;
+ if (iwl_legacy_check_stuck_queue(priv, cnt))
+ return;
}
mod_timer(&priv->watchdog, jiffies +
--
1.7.7.6
@@ -0,0 +1,51 @@
From 910c9012a7e02b93cc1f877aa8ef245dd1d99fbe Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Wed, 4 Jul 2012 13:59:08 +0200
Subject: [PATCH 055/109] iwlegacy: don't mess up the SCD when removing a key
commit b48d96652626b315229b1b82c6270eead6a77a6d upstream.
When we remove a key, we put a key index which was supposed
to tell the fw that we are actually removing the key. But
instead the fw took that index as a valid index and messed
up the SRAM of the device.
This memory corruption on the device mangled the data of
the SCD. The impact on the user is that SCD queue 2 got
stuck after having removed keys.
Reported-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
[bwh: Backported to 3.2: adjust filename, context and variable name]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/wireless/iwlegacy/iwl-4965-sta.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/wireless/iwlegacy/iwl-4965-sta.c b/drivers/net/wireless/iwlegacy/iwl-4965-sta.c
index a262c23..0116ca8 100644
--- a/drivers/net/wireless/iwlegacy/iwl-4965-sta.c
+++ b/drivers/net/wireless/iwlegacy/iwl-4965-sta.c
@@ -466,7 +466,7 @@ int iwl4965_remove_dynamic_key(struct iwl_priv *priv,
return 0;
}
- if (priv->stations[sta_id].sta.key.key_offset == WEP_INVALID_OFFSET) {
+ if (priv->stations[sta_id].sta.key.key_flags & STA_KEY_FLG_INVALID) {
IWL_WARN(priv, "Removing wrong key %d 0x%x\n",
keyconf->keyidx, key_flags);
spin_unlock_irqrestore(&priv->sta_lock, flags);
@@ -483,7 +483,7 @@ int iwl4965_remove_dynamic_key(struct iwl_priv *priv,
sizeof(struct iwl4965_keyinfo));
priv->stations[sta_id].sta.key.key_flags =
STA_KEY_FLG_NO_ENC | STA_KEY_FLG_INVALID;
- priv->stations[sta_id].sta.key.key_offset = WEP_INVALID_OFFSET;
+ priv->stations[sta_id].sta.key.key_offset = keyconf->hw_key_idx;
priv->stations[sta_id].sta.sta.modify_mask = STA_MODIFY_KEY_MASK;
priv->stations[sta_id].sta.mode = STA_CONTROL_MODIFY_MSK;
--
1.7.7.6
@@ -0,0 +1,39 @@
From c82dafb3ba87352cb605641f7d709ec76dc64168 Mon Sep 17 00:00:00 2001
From: Tushar Dave <tushar.n.dave@intel.com>
Date: Thu, 12 Jul 2012 08:56:56 +0000
Subject: [PATCH 056/109] e1000e: Correct link check logic for 82571 serdes
commit d0efa8f23a644f7cb7d1f8e78dd9a223efa412a3 upstream.
SYNCH bit and IV bit of RXCW register are sticky. Before examining these bits,
RXCW should be read twice to filter out one-time false events and have correct
values for these bits. Incorrect values of these bits in link check logic can
cause weird link stability issues if auto-negotiation fails.
Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Tushar Dave <tushar.n.dave@intel.com>
Reviewed-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/ethernet/intel/e1000e/82571.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c
index e556fc3..3072d35 100644
--- a/drivers/net/ethernet/intel/e1000e/82571.c
+++ b/drivers/net/ethernet/intel/e1000e/82571.c
@@ -1571,6 +1571,9 @@ static s32 e1000_check_for_serdes_link_82571(struct e1000_hw *hw)
ctrl = er32(CTRL);
status = er32(STATUS);
rxcw = er32(RXCW);
+ /* SYNCH bit and IV bit are sticky */
+ udelay(10);
+ rxcw = er32(RXCW);
if ((rxcw & E1000_RXCW_SYNCH) && !(rxcw & E1000_RXCW_IV)) {
--
1.7.7.6
@@ -0,0 +1,39 @@
From f846f3528f3dcd02646a919a50696d026e0864ae Mon Sep 17 00:00:00 2001
From: Mark Rustad <mark.d.rustad@intel.com>
Date: Fri, 13 Jul 2012 18:18:04 -0700
Subject: [PATCH 057/109] tcm_fc: Fix crash seen with aborts and large reads
commit 3cc5d2a6b9a2fd1bf024aa5e52dd22961eecaf13 upstream.
This patch fixes a crash seen when large reads have their exchange
aborted by either timing out or being reset. Because the exchange
abort results in the seq pointer being set to NULL, because the
sequence is no longer valid, it must not be dereferenced. This
patch changes the function ft_get_task_tag to return ~0 if it is
unable to get the tag for this reason. Because the get_task_tag
interface provides no means of returning an error, this seems
like the best way to fix this issue at the moment.
Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/target/tcm_fc/tfc_cmd.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/drivers/target/tcm_fc/tfc_cmd.c b/drivers/target/tcm_fc/tfc_cmd.c
index d95cfe2..278819c 100644
--- a/drivers/target/tcm_fc/tfc_cmd.c
+++ b/drivers/target/tcm_fc/tfc_cmd.c
@@ -249,6 +249,8 @@ u32 ft_get_task_tag(struct se_cmd *se_cmd)
{
struct ft_cmd *cmd = container_of(se_cmd, struct ft_cmd, se_cmd);
+ if (cmd->aborted)
+ return ~0;
return fc_seq_exch(cmd->seq)->rxid;
}
--
1.7.7.6
@@ -0,0 +1,115 @@
From 13d0304203a528b1c1c76b5c9b6f5b8dc093f996 Mon Sep 17 00:00:00 2001
From: Anders Kaseorg <andersk@MIT.EDU>
Date: Sun, 15 Jul 2012 17:14:25 -0400
Subject: [PATCH 058/109] fifo: Do not restart open() if it already found a
partner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 05d290d66be6ef77a0b962ebecf01911bd984a78 upstream.
If a parent and child process open the two ends of a fifo, and the
child immediately exits, the parent may receive a SIGCHLD before its
open() returns. In that case, we need to make sure that open() will
return successfully after the SIGCHLD handler returns, instead of
throwing EINTR or being restarted. Otherwise, the restarted open()
would incorrectly wait for a second partner on the other end.
The following test demonstrates the EINTR that was wrongly thrown from
the parents open(). Change .sa_flags = 0 to .sa_flags = SA_RESTART
to see a deadlock instead, in which the restarted open() waits for a
second reader that will never come. (On my systems, this happens
pretty reliably within about 5 to 500 iterations. Others report that
it manages to loop ~forever sometimes; YMMV.)
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#define CHECK(x) do if ((x) == -1) {perror(#x); abort();} while(0)
void handler(int signum) {}
int main()
{
struct sigaction act = {.sa_handler = handler, .sa_flags = 0};
CHECK(sigaction(SIGCHLD, &act, NULL));
CHECK(mknod("fifo", S_IFIFO | S_IRWXU, 0));
for (;;) {
int fd;
pid_t pid;
putc('.', stderr);
CHECK(pid = fork());
if (pid == 0) {
CHECK(fd = open("fifo", O_RDONLY));
_exit(0);
}
CHECK(fd = open("fifo", O_WRONLY));
CHECK(close(fd));
CHECK(waitpid(pid, NULL, 0));
}
}
This is what I suspect was causing the Git test suite to fail in
t9010-svn-fe.sh:
http://bugs.debian.org/678852
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
Reviewed-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/fifo.c | 9 ++++-----
1 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/fs/fifo.c b/fs/fifo.c
index b1a524d..cf6f434 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -14,7 +14,7 @@
#include <linux/sched.h>
#include <linux/pipe_fs_i.h>
-static void wait_for_partner(struct inode* inode, unsigned int *cnt)
+static int wait_for_partner(struct inode* inode, unsigned int *cnt)
{
int cur = *cnt;
@@ -23,6 +23,7 @@ static void wait_for_partner(struct inode* inode, unsigned int *cnt)
if (signal_pending(current))
break;
}
+ return cur == *cnt ? -ERESTARTSYS : 0;
}
static void wake_up_partner(struct inode* inode)
@@ -67,8 +68,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
* seen a writer */
filp->f_version = pipe->w_counter;
} else {
- wait_for_partner(inode, &pipe->w_counter);
- if(signal_pending(current))
+ if (wait_for_partner(inode, &pipe->w_counter))
goto err_rd;
}
}
@@ -90,8 +90,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
wake_up_partner(inode);
if (!pipe->readers) {
- wait_for_partner(inode, &pipe->r_counter);
- if (signal_pending(current))
+ if (wait_for_partner(inode, &pipe->r_counter))
goto err_wr;
}
break;
--
1.7.7.6
@@ -0,0 +1,55 @@
From ab68c7b575aff70124f83d2ec207d06c60eea003 Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@purestorage.com>
Date: Mon, 16 Jul 2012 15:17:10 -0700
Subject: [PATCH 059/109] target: Clean up returning errors in PR handling
code
commit d35212f3ca3bf4fb49d15e37f530c9931e2d2183 upstream.
- instead of (PTR_ERR(file) < 0) just use IS_ERR(file)
- return -EINVAL instead of EINVAL
- all other error returns in target_scsi3_emulate_pr_out() use
"goto out" -- get rid of the one remaining straight "return."
Signed-off-by: Roland Dreier <roland@purestorage.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/target/target_core_pr.c | 7 ++++---
1 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c
index b75bc92..9145141 100644
--- a/drivers/target/target_core_pr.c
+++ b/drivers/target/target_core_pr.c
@@ -2042,7 +2042,7 @@ static int __core_scsi3_write_aptpl_to_file(
if (IS_ERR(file) || !file || !file->f_dentry) {
pr_err("filp_open(%s) for APTPL metadata"
" failed\n", path);
- return (PTR_ERR(file) < 0 ? PTR_ERR(file) : -ENOENT);
+ return IS_ERR(file) ? PTR_ERR(file) : -ENOENT;
}
iov[0].iov_base = &buf[0];
@@ -3853,7 +3853,7 @@ int target_scsi3_emulate_pr_out(struct se_task *task)
" SPC-2 reservation is held, returning"
" RESERVATION_CONFLICT\n");
cmd->scsi_sense_reason = TCM_RESERVATION_CONFLICT;
- ret = EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -3863,7 +3863,8 @@ int target_scsi3_emulate_pr_out(struct se_task *task)
*/
if (!cmd->se_sess) {
cmd->scsi_sense_reason = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
if (cmd->data_length < 24) {
--
1.7.7.6
@@ -0,0 +1,41 @@
From 63a96e329f2c66af145a93d6f42067e54ef932af Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@purestorage.com>
Date: Mon, 16 Jul 2012 17:10:17 -0700
Subject: [PATCH 060/109] target: Fix range calculation in WRITE SAME
emulation when num blocks == 0
commit 1765fe5edcb83f53fc67edeb559fcf4bc82c6460 upstream.
When NUMBER OF LOGICAL BLOCKS is 0, WRITE SAME is supposed to write
all the blocks from the specified LBA through the end of the device.
However, dev->transport->get_blocks(dev) (perhaps confusingly) returns
the last valid LBA rather than the number of blocks, so the correct
number of blocks to write starting with lba is
dev->transport->get_blocks(dev) - lba + 1
(nab: Backport roland's for-3.6 patch to for-3.5)
Signed-off-by: Roland Dreier <roland@purestorage.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/target/target_core_cdb.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/target/target_core_cdb.c b/drivers/target/target_core_cdb.c
index 65ea65a..93b9406 100644
--- a/drivers/target/target_core_cdb.c
+++ b/drivers/target/target_core_cdb.c
@@ -1199,7 +1199,7 @@ int target_emulate_write_same(struct se_task *task)
if (num_blocks != 0)
range = num_blocks;
else
- range = (dev->transport->get_blocks(dev) - lba);
+ range = (dev->transport->get_blocks(dev) - lba) + 1;
pr_debug("WRITE_SAME UNMAP: LBA: %llu Range: %llu\n",
(unsigned long long)lba, (unsigned long long)range);
--
1.7.7.6
@@ -0,0 +1,82 @@
From 0028855e0b717cadb5fc6b05934af9bd9d2cc4c1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 11 Jul 2012 09:09:35 -0400
Subject: [PATCH 061/109] cifs: on CONFIG_HIGHMEM machines, limit the
rsize/wsize to the kmap space
commit 3ae629d98bd5ed77585a878566f04f310adbc591 upstream.
We currently rely on being able to kmap all of the pages in an async
read or write request. If you're on a machine that has CONFIG_HIGHMEM
set then that kmap space is limited, sometimes to as low as 512 slots.
With 512 slots, we can only support up to a 2M r/wsize, and that's
assuming that we can get our greedy little hands on all of them. There
are other users however, so it's possible we'll end up stuck with a
size that large.
Since we can't handle a rsize or wsize larger than that currently, cap
those options at the number of kmap slots we have. We could consider
capping it even lower, but we currently default to a max of 1M. Might as
well allow those luddites on 32 bit arches enough rope to hang
themselves.
A more robust fix would be to teach the send and receive routines how
to contend with an array of pages so we don't need to marshal up a kvec
array at all. That's a fairly significant overhaul though, so we'll need
this limit in place until that's ready.
Reported-by: Jian Li <jiali@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/cifs/connect.c | 18 ++++++++++++++++++
1 files changed, 18 insertions(+), 0 deletions(-)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b21670c..56c152d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2925,6 +2925,18 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
+/*
+ * On hosts with high memory, we can't currently support wsize/rsize that are
+ * larger than we can kmap at once. Cap the rsize/wsize at
+ * LAST_PKMAP * PAGE_SIZE. We'll never be able to fill a read or write request
+ * larger than that anyway.
+ */
+#ifdef CONFIG_HIGHMEM
+#define CIFS_KMAP_SIZE_LIMIT (LAST_PKMAP * PAGE_CACHE_SIZE)
+#else /* CONFIG_HIGHMEM */
+#define CIFS_KMAP_SIZE_LIMIT (1<<24)
+#endif /* CONFIG_HIGHMEM */
+
static unsigned int
cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
{
@@ -2955,6 +2967,9 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
wsize = min_t(unsigned int, wsize,
server->maxBuf - sizeof(WRITE_REQ) + 4);
+ /* limit to the amount that we can kmap at once */
+ wsize = min_t(unsigned int, wsize, CIFS_KMAP_SIZE_LIMIT);
+
/* hard limit of CIFS_MAX_WSIZE */
wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
@@ -2996,6 +3011,9 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
if (!(server->capabilities & CAP_LARGE_READ_X))
rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
+ /* limit to the amount that we can kmap at once */
+ rsize = min_t(unsigned int, rsize, CIFS_KMAP_SIZE_LIMIT);
+
/* hard limit of CIFS_MAX_RSIZE */
rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
--
1.7.7.6
@@ -0,0 +1,47 @@
From 3d7e548a161a109e404e1068901f834c69eeb0ea Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 6 Jul 2012 07:09:42 -0400
Subject: [PATCH 062/109] cifs: always update the inode cache with the results
from a FIND_*
commit cd60042cc1392e79410dc8de9e9c1abb38a29e57 upstream.
When we get back a FIND_FIRST/NEXT result, we have some info about the
dentry that we use to instantiate a new inode. We were ignoring and
discarding that info when we had an existing dentry in the cache.
Fix this by updating the inode in place when we find an existing dentry
and the uniqueid is the same.
Reported-and-Tested-by: Andrew Bartlett <abartlet@samba.org>
Reported-by: Bill Robertson <bill_robertson@debortoli.com.au>
Reported-by: Dion Edwards <dion_edwards@debortoli.com.au>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/cifs/readdir.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index db4a138..4c37ed4 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -86,9 +86,12 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
dentry = d_lookup(parent, name);
if (dentry) {
- /* FIXME: check for inode number changes? */
- if (dentry->d_inode != NULL)
+ inode = dentry->d_inode;
+ /* update inode in place if i_ino didn't change */
+ if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
+ cifs_fattr_to_inode(inode, fattr);
return dentry;
+ }
d_drop(dentry);
dput(dentry);
}
--
1.7.7.6
@@ -0,0 +1,84 @@
From 6ece4e48bfa223f77eff8fc4d2fcc4808214f42e Mon Sep 17 00:00:00 2001
From: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Date: Tue, 17 Jul 2012 15:48:07 -0700
Subject: [PATCH 063/109] mm: fix lost kswapd wakeup in kswapd_stop()
commit 1c7e7f6c0703d03af6bcd5ccc11fc15d23e5ecbe upstream.
Offlining memory may block forever, waiting for kswapd() to wake up
because kswapd() does not check the event kthread->should_stop before
sleeping.
The proper pattern, from Documentation/memory-barriers.txt, is:
--- waker ---
event_indicated = 1;
wake_up_process(event_daemon);
--- sleeper ---
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (event_indicated)
break;
schedule();
}
set_current_state() may be wrapped by:
prepare_to_wait();
In the kswapd() case, event_indicated is kthread->should_stop.
=== offlining memory (waker) ===
kswapd_stop()
kthread_stop()
kthread->should_stop = 1
wake_up_process()
wait_for_completion()
=== kswapd_try_to_sleep (sleeper) ===
kswapd_try_to_sleep()
prepare_to_wait()
.
.
schedule()
.
.
finish_wait()
The schedule() needs to be protected by a test of kthread->should_stop,
which is wrapped by kthread_should_stop().
Reproducer:
Do heavy file I/O in background.
Do a memory offline/online in a tight loop
Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
mm/vmscan.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 72cf498..8342119 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2824,7 +2824,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- schedule();
+
+ if (!kthread_should_stop())
+ schedule();
+
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
--
1.7.7.6
@@ -0,0 +1,164 @@
From 22c2c30192d85ffa042433e89e929b4ea08ab528 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 19 Jul 2012 15:59:18 +1000
Subject: [PATCH 064/109] md: avoid crash when stopping md array races with
closing other open fds.
commit a05b7ea03d72f36edb0cec05e8893803335c61a0 upstream.
md will refuse to stop an array if any other fd (or mounted fs) is
using it.
When any fs is unmounted of when the last open fd is closed all
pending IO will be flushed (e.g. sync_blockdev call in __blkdev_put)
so there will be no pending IO to worry about when the array is
stopped.
However in order to send the STOP_ARRAY ioctl to stop the array one
must first get and open fd on the block device.
If some fd is being used to write to the block device and it is closed
after mdadm open the block device, but before mdadm issues the
STOP_ARRAY ioctl, then there will be no last-close on the md device so
__blkdev_put will not call sync_blockdev.
If this happens, then IO can still be in-flight while md tears down
the array and bad things can happen (use-after-free and subsequent
havoc).
So in the case where do_md_stop is being called from an open file
descriptor, call sync_block after taking the mutex to ensure there
will be no new openers.
This is needed when setting a read-write device to read-only too.
Reported-by: majianpeng <majianpeng@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/md/md.c | 36 +++++++++++++++++++++++-------------
1 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 700ecae..d8646d7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3700,8 +3700,8 @@ array_state_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", array_states[st]);
}
-static int do_md_stop(struct mddev * mddev, int ro, int is_open);
-static int md_set_readonly(struct mddev * mddev, int is_open);
+static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev);
+static int md_set_readonly(struct mddev * mddev, struct block_device *bdev);
static int do_md_run(struct mddev * mddev);
static int restart_array(struct mddev *mddev);
@@ -3717,14 +3717,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
/* stopping an active array */
if (atomic_read(&mddev->openers) > 0)
return -EBUSY;
- err = do_md_stop(mddev, 0, 0);
+ err = do_md_stop(mddev, 0, NULL);
break;
case inactive:
/* stopping an active array */
if (mddev->pers) {
if (atomic_read(&mddev->openers) > 0)
return -EBUSY;
- err = do_md_stop(mddev, 2, 0);
+ err = do_md_stop(mddev, 2, NULL);
} else
err = 0; /* already inactive */
break;
@@ -3732,7 +3732,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break; /* not supported yet */
case readonly:
if (mddev->pers)
- err = md_set_readonly(mddev, 0);
+ err = md_set_readonly(mddev, NULL);
else {
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
@@ -3742,7 +3742,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case read_auto:
if (mddev->pers) {
if (mddev->ro == 0)
- err = md_set_readonly(mddev, 0);
+ err = md_set_readonly(mddev, NULL);
else if (mddev->ro == 1)
err = restart_array(mddev);
if (err == 0) {
@@ -5078,15 +5078,17 @@ void md_stop(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_stop);
-static int md_set_readonly(struct mddev *mddev, int is_open)
+static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
{
int err = 0;
mutex_lock(&mddev->open_mutex);
- if (atomic_read(&mddev->openers) > is_open) {
+ if (atomic_read(&mddev->openers) > !!bdev) {
printk("md: %s still in use.\n",mdname(mddev));
err = -EBUSY;
goto out;
}
+ if (bdev)
+ sync_blockdev(bdev);
if (mddev->pers) {
__md_stop_writes(mddev);
@@ -5108,18 +5110,26 @@ out:
* 0 - completely stop and dis-assemble array
* 2 - stop but do not disassemble array
*/
-static int do_md_stop(struct mddev * mddev, int mode, int is_open)
+static int do_md_stop(struct mddev * mddev, int mode,
+ struct block_device *bdev)
{
struct gendisk *disk = mddev->gendisk;
struct md_rdev *rdev;
mutex_lock(&mddev->open_mutex);
- if (atomic_read(&mddev->openers) > is_open ||
+ if (atomic_read(&mddev->openers) > !!bdev ||
mddev->sysfs_active) {
printk("md: %s still in use.\n",mdname(mddev));
mutex_unlock(&mddev->open_mutex);
return -EBUSY;
}
+ if (bdev)
+ /* It is possible IO was issued on some other
+ * open file which was closed before we took ->open_mutex.
+ * As that was not the last close __blkdev_put will not
+ * have called sync_blockdev, so we must.
+ */
+ sync_blockdev(bdev);
if (mddev->pers) {
if (mddev->ro)
@@ -5193,7 +5203,7 @@ static void autorun_array(struct mddev *mddev)
err = do_md_run(mddev);
if (err) {
printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
- do_md_stop(mddev, 0, 0);
+ do_md_stop(mddev, 0, NULL);
}
}
@@ -6184,11 +6194,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
goto done_unlock;
case STOP_ARRAY:
- err = do_md_stop(mddev, 0, 1);
+ err = do_md_stop(mddev, 0, bdev);
goto done_unlock;
case STOP_ARRAY_RO:
- err = md_set_readonly(mddev, 1);
+ err = md_set_readonly(mddev, bdev);
goto done_unlock;
case BLKROSET:
--
1.7.7.6
@@ -0,0 +1,58 @@
From a5f676adf9ef247dd5363de5f0e26d0bdb6597bc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 19 Jul 2012 15:59:18 +1000
Subject: [PATCH 065/109] md/raid1: close some possible races on write errors
during resync
commit 58e94ae18478c08229626daece2fc108a4a23261 upstream.
commit 4367af556133723d0f443e14ca8170d9447317cb
md/raid1: clear bad-block record when write succeeds.
Added a 'reschedule_retry' call possibility at the end of
end_sync_write, but didn't add matching code at the end of
sync_request_write. So if the writes complete very quickly, or
scheduling makes it seem that way, then we can miss rescheduling
the request and the resync could hang.
Also commit 73d5c38a9536142e062c35997b044e89166e063b
md: avoid races when stopping resync.
Fix a race condition in this same code in end_sync_write but didn't
make the change in sync_request_write.
This patch updates sync_request_write to fix both of those.
Patch is suitable for 3.1 and later kernels.
Reported-by: Alexander Lyakas <alex.bolshoy@gmail.com>
Original-version-by: Alexander Lyakas <alex.bolshoy@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/md/raid1.c | 10 ++++++++--
1 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 58f0055..2d97bf0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1713,8 +1713,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
if (atomic_dec_and_test(&r1_bio->remaining)) {
/* if we're here, all write(s) have completed, so clean up */
- md_done_sync(mddev, r1_bio->sectors, 1);
- put_buf(r1_bio);
+ int s = r1_bio->sectors;
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, 1);
+ }
}
}
--
1.7.7.6
@@ -0,0 +1,70 @@
From 892d35f24ea2801daef7e48e41d8ec4e9bac34e8 Mon Sep 17 00:00:00 2001
From: David Daney <david.daney@cavium.com>
Date: Thu, 19 Jul 2012 09:11:14 +0200
Subject: [PATCH 066/109] MIPS: Properly align the .data..init_task section.
commit 7b1c0d26a8e272787f0f9fcc5f3e8531df3b3409 upstream.
Improper alignment can lead to unbootable systems and/or random
crashes.
[ralf@linux-mips.org: This is a lond standing bug since
6eb10bc9e2deab06630261cd05c4cb1e9a60e980 (kernel.org) rsp.
c422a10917f75fd19fa7fe070aaaa23e384dae6f (lmo) [MIPS: Clean up linker script
using new linker script macros.] so dates back to 2.6.32.]
Signed-off-by: David Daney <david.daney@cavium.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/3881/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/mips/include/asm/thread_info.h | 4 ++--
arch/mips/kernel/vmlinux.lds.S | 3 ++-
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index 97f8bf6..adda036 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -60,6 +60,8 @@ struct thread_info {
register struct thread_info *__current_thread_info __asm__("$28");
#define current_thread_info() __current_thread_info
+#endif /* !__ASSEMBLY__ */
+
/* thread information allocation */
#if defined(CONFIG_PAGE_SIZE_4KB) && defined(CONFIG_32BIT)
#define THREAD_SIZE_ORDER (1)
@@ -97,8 +99,6 @@ register struct thread_info *__current_thread_info __asm__("$28");
#define free_thread_info(info) kfree(info)
-#endif /* !__ASSEMBLY__ */
-
#define PREEMPT_ACTIVE 0x10000000
/*
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index a81176f..be281c6 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -1,5 +1,6 @@
#include <asm/asm-offsets.h>
#include <asm/page.h>
+#include <asm/thread_info.h>
#include <asm-generic/vmlinux.lds.h>
#undef mips
@@ -73,7 +74,7 @@ SECTIONS
.data : { /* Data */
. = . + DATAOFFSET; /* for CONFIG_MAPPED_KERNEL */
- INIT_TASK_DATA(PAGE_SIZE)
+ INIT_TASK_DATA(THREAD_SIZE)
NOSAVE_DATA
CACHELINE_ALIGNED_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT)
READ_MOSTLY_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT)
--
1.7.7.6
@@ -0,0 +1,67 @@
From f6ba94c29333fa6df9b3b553415e93bafbd3c831 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@linux.intel.com>
Date: Sat, 14 Jul 2012 14:33:09 +0300
Subject: [PATCH 067/109] UBIFS: fix a bug in empty space fix-up
commit c6727932cfdb13501108b16c38463c09d5ec7a74 upstream.
UBIFS has a feature called "empty space fix-up" which is a quirk to work-around
limitations of dumb flasher programs. Namely, of those flashers that are unable
to skip NAND pages full of 0xFFs while flashing, resulting in empty space at
the end of half-filled eraseblocks to be unusable for UBIFS. This feature is
relatively new (introduced in v3.0).
The fix-up routine (fixup_free_space()) is executed only once at the very first
mount if the superblock has the 'space_fixup' flag set (can be done with -F
option of mkfs.ubifs). It basically reads all the UBIFS data and metadata and
writes it back to the same LEB. The routine assumes the image is pristine and
does not have anything in the journal.
There was a bug in 'fixup_free_space()' where it fixed up the log incorrectly.
All but one LEB of the log of a pristine file-system are empty. And one
contains just a commit start node. And 'fixup_free_space()' just unmapped this
LEB, which resulted in wiping the commit start node. As a result, some users
were unable to mount the file-system next time with the following symptom:
UBIFS error (pid 1): replay_log_leb: first log node at LEB 3:0 is not CS node
UBIFS error (pid 1): replay_log_leb: log error detected while replaying the log at LEB 3:0
The root-cause of this bug was that 'fixup_free_space()' wrongly assumed
that the beginning of empty space in the log head (c->lhead_offs) was known
on mount. However, it is not the case - it was always 0. UBIFS does not store
in it the master node and finds out by scanning the log on every mount.
The fix is simple - just pass commit start node size instead of 0 to
'fixup_leb()'.
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@linux.intel.com>
Reported-by: Iwo Mergler <Iwo.Mergler@netcommwireless.com>
Tested-by: Iwo Mergler <Iwo.Mergler@netcommwireless.com>
Reported-by: James Nute <newten82@gmail.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/ubifs/sb.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 6094c5a..b73ecd8 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -715,8 +715,12 @@ static int fixup_free_space(struct ubifs_info *c)
lnum = ubifs_next_log_lnum(c, lnum);
}
- /* Fixup the current log head */
- err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
+ /*
+ * Fixup the log head which contains the only a CS node at the
+ * beginning.
+ */
+ err = fixup_leb(c, c->lhead_lnum,
+ ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size));
if (err)
goto out;
--
1.7.7.6
@@ -0,0 +1,211 @@
From b4c39a3690fd0d723f50eba441fe567e8fee68f1 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Fri, 8 Jun 2012 01:19:07 +0300
Subject: [PATCH 068/109] ore: Fix NFS crash by supporting any unaligned RAID
IO
commit 9ff19309a9623f2963ac5a136782ea4d8b5d67fb upstream.
In RAID_5/6 We used to not permit an IO that it's end
byte is not stripe_size aligned and spans more than one stripe.
.i.e the caller must check if after submission the actual
transferred bytes is shorter, and would need to resubmit
a new IO with the remainder.
Exofs supports this, and NFS was supposed to support this
as well with it's short write mechanism. But late testing has
exposed a CRASH when this is used with none-RPC layout-drivers.
The change at NFS is deep and risky, in it's place the fix
at ORE to lift the limitation is actually clean and simple.
So here it is below.
The principal here is that in the case of unaligned IO on
both ends, beginning and end, we will send two read requests
one like old code, before the calculation of the first stripe,
and also a new site, before the calculation of the last stripe.
If any "boundary" is aligned or the complete IO is within a single
stripe. we do a single read like before.
The code is clean and simple by splitting the old _read_4_write
into 3 even parts:
1._read_4_write_first_stripe
2. _read_4_write_last_stripe
3. _read_4_write_execute
And calling 1+3 at the same place as before. 2+3 before last
stripe, and in the case of all in a single stripe then 1+2+3
is preformed additively.
Why did I not think of it before. Well I had a strike of
genius because I have stared at this code for 2 years, and did
not find this simple solution, til today. Not that I did not try.
This solution is much better for NFS than the previous supposedly
solution because the short write was dealt with out-of-band after
IO_done, which would cause for a seeky IO pattern where as in here
we execute in order. At both solutions we do 2 separate reads, only
here we do it within a single IO request. (And actually combine two
writes into a single submission)
NFS/exofs code need not change since the ORE API communicates the new
shorter length on return, what will happen is that this case would not
occur anymore.
hurray!!
[Stable this is an NFS bug since 3.2 Kernel should apply cleanly]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/exofs/ore_raid.c | 67 +++++++++++++++++++++++++++-----------------------
1 files changed, 36 insertions(+), 31 deletions(-)
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index d222c77..fff2070 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -461,16 +461,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
* ios->sp2d[p][*], xor is calculated the same way. These pages are
* allocated/freed and don't go through cache
*/
-static int _read_4_write(struct ore_io_state *ios)
+static int _read_4_write_first_stripe(struct ore_io_state *ios)
{
- struct ore_io_state *ios_read;
struct ore_striping_info read_si;
struct __stripe_pages_2d *sp2d = ios->sp2d;
u64 offset = ios->si.first_stripe_start;
- u64 last_stripe_end;
- unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
- unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
- int ret;
+ unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
if (offset == ios->offset) /* Go to start collect $200 */
goto read_last_stripe;
@@ -478,6 +474,9 @@ static int _read_4_write(struct ore_io_state *ios)
min_p = _sp2d_min_pg(sp2d);
max_p = _sp2d_max_pg(sp2d);
+ ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
+ offset, ios->offset, min_p, max_p);
+
for (c = 0; ; c++) {
ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
read_si.obj_offset += min_p * PAGE_SIZE;
@@ -512,6 +511,18 @@ static int _read_4_write(struct ore_io_state *ios)
}
read_last_stripe:
+ return 0;
+}
+
+static int _read_4_write_last_stripe(struct ore_io_state *ios)
+{
+ struct ore_striping_info read_si;
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ u64 offset;
+ u64 last_stripe_end;
+ unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+ unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+
offset = ios->offset + ios->length;
if (offset % PAGE_SIZE)
_add_to_r4w_last_page(ios, &offset);
@@ -527,15 +538,15 @@ read_last_stripe:
c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
- BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
- /* unaligned IO must be within a single stripe */
-
if (min_p == sp2d->pages_in_unit) {
/* Didn't do it yet */
min_p = _sp2d_min_pg(sp2d);
max_p = _sp2d_max_pg(sp2d);
}
+ ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
+ offset, last_stripe_end, min_p, max_p);
+
while (offset < last_stripe_end) {
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
@@ -568,6 +579,15 @@ read_last_stripe:
}
read_it:
+ return 0;
+}
+
+static int _read_4_write_execute(struct ore_io_state *ios)
+{
+ struct ore_io_state *ios_read;
+ unsigned i;
+ int ret;
+
ios_read = ios->ios_read_4_write;
if (!ios_read)
return 0;
@@ -591,6 +611,8 @@ read_it:
}
_mark_read4write_pages_uptodate(ios_read, ret);
+ ore_put_io_state(ios_read);
+ ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
return 0;
}
@@ -626,8 +648,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
/* If first stripe, Read in all read4write pages
* (if needed) before we calculate the first parity.
*/
- _read_4_write(ios);
+ _read_4_write_first_stripe(ios);
}
+ if (!cur_len) /* If last stripe r4w pages of last stripe */
+ _read_4_write_last_stripe(ios);
+ _read_4_write_execute(ios);
for (i = 0; i < num_pages; i++) {
pages[i] = _raid_page_alloc();
@@ -654,34 +679,14 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
{
- struct ore_layout *layout = ios->layout;
-
if (ios->parity_pages) {
+ struct ore_layout *layout = ios->layout;
unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
- unsigned stripe_size = ios->si.bytes_in_stripe;
- u64 last_stripe, first_stripe;
if (_sp2d_alloc(pages_in_unit, layout->group_width,
layout->parity, &ios->sp2d)) {
return -ENOMEM;
}
-
- /* Round io down to last full strip */
- first_stripe = div_u64(ios->offset, stripe_size);
- last_stripe = div_u64(ios->offset + ios->length, stripe_size);
-
- /* If an IO spans more then a single stripe it must end at
- * a stripe boundary. The reminder at the end is pushed into the
- * next IO.
- */
- if (last_stripe != first_stripe) {
- ios->length = last_stripe * stripe_size - ios->offset;
-
- BUG_ON(!ios->length);
- ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
- PAGE_SIZE;
- ios->si.length = ios->length; /*make it consistent */
- }
}
return 0;
}
--
1.7.7.6
@@ -0,0 +1,50 @@
From a2f43c94b074e0bf567ddc35e17504bbcd237ae1 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Fri, 8 Jun 2012 04:30:40 +0300
Subject: [PATCH 069/109] ore: Remove support of partial IO request (NFS
crash)
commit 62b62ad873f2accad9222a4d7ffbe1e93f6714c1 upstream.
Do to OOM situations the ore might fail to allocate all resources
needed for IO of the full request. If some progress was possible
it would proceed with a partial/short request, for the sake of
forward progress.
Since this crashes NFS-core and exofs is just fine without it just
remove this contraption, and fail.
TODO:
Support real forward progress with some reserved allocations
of resources, such as mem pools and/or bio_sets
[Bug since 3.2 Kernel]
CC: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/exofs/ore.c | 8 +-------
1 files changed, 1 insertions(+), 7 deletions(-)
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 49cf230..24a49d4 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -735,13 +735,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
out:
ios->numdevs = devs_in_group;
ios->pages_consumed = cur_pg;
- if (unlikely(ret)) {
- if (length == ios->length)
- return ret;
- else
- ios->length -= length;
- }
- return 0;
+ return ret;
}
int ore_create(struct ore_io_state *ios)
--
1.7.7.6
@@ -0,0 +1,46 @@
From 7b47a0e2114e8530614a25d7ec998fd52d069853 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Fri, 8 Jun 2012 05:29:40 +0300
Subject: [PATCH 070/109] pnfs-obj: don't leak objio_state if ore_write/read
fails
commit 9909d45a8557455ca5f8ee7af0f253debc851f1a upstream.
[Bug since 3.2 Kernel]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/nfs/objlayout/objio_osd.c | 9 +++++++--
1 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 55d0128..0e7b3fc 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -433,7 +433,10 @@ int objio_read_pagelist(struct nfs_read_data *rdata)
objios->ios->done = _read_done;
dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
rdata->args.offset, rdata->args.count);
- return ore_read(objios->ios);
+ ret = ore_read(objios->ios);
+ if (unlikely(ret))
+ objio_free_result(&objios->oir);
+ return ret;
}
/*
@@ -517,8 +520,10 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how)
dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
wdata->args.offset, wdata->args.count);
ret = ore_write(objios->ios);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ objio_free_result(&objios->oir);
return ret;
+ }
if (objios->sync)
_write_done(objios->ios, objios);
--
1.7.7.6
@@ -0,0 +1,71 @@
From e4750a0414e24bcd0106493a2f8f251dd02264bf Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Fri, 8 Jun 2012 02:02:30 +0300
Subject: [PATCH 071/109] pnfs-obj: Fix __r4w_get_page when offset is beyond
i_size
commit c999ff68029ebd0f56ccae75444f640f6d5a27d2 upstream.
It is very common for the end of the file to be unaligned on
stripe size. But since we know it's beyond file's end then
the XOR should be preformed with all zeros.
Old code used to just read zeros out of the OSD devices, which is a great
waist. But what scares me more about this situation is that, we now have
pages attached to the file's mapping that are beyond i_size. I don't
like the kind of bugs this calls for.
Fix both birds, by returning a global zero_page, if offset is beyond
i_size.
TODO:
Change the API to ->__r4w_get_page() so a NULL can be
returned without being considered as error, since XOR API
treats NULL entries as zero_pages.
[Bug since 3.2. Should apply the same way to all Kernels since]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[bwh: Backported to 3.2: adjust for lack of wdata->header]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/nfs/objlayout/objio_osd.c | 16 +++++++++++++---
1 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 0e7b3fc..a03ee52 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -467,8 +467,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
struct objio_state *objios = priv;
struct nfs_write_data *wdata = objios->oir.rpcdata;
pgoff_t index = offset / PAGE_SIZE;
- struct page *page = find_get_page(wdata->inode->i_mapping, index);
+ struct page *page;
+ loff_t i_size = i_size_read(wdata->inode);
+ if (offset >= i_size) {
+ *uptodate = true;
+ dprintk("%s: g_zero_page index=0x%lx\n", __func__, index);
+ return ZERO_PAGE(0);
+ }
+
+ page = find_get_page(wdata->inode->i_mapping, index);
if (!page) {
page = find_or_create_page(wdata->inode->i_mapping,
index, GFP_NOFS);
@@ -489,8 +497,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
static void __r4w_put_page(void *priv, struct page *page)
{
- dprintk("%s: index=0x%lx\n", __func__, page->index);
- page_cache_release(page);
+ dprintk("%s: index=0x%lx\n", __func__,
+ (page == ZERO_PAGE(0)) ? -1UL : page->index);
+ if (ZERO_PAGE(0) != page)
+ page_cache_release(page);
return;
}
--
1.7.7.6
@@ -0,0 +1,114 @@
From 035afb0de8406d0f820abf43471d51a377add326 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 20 Jul 2012 14:25:03 +0100
Subject: [PATCH 072/109] dm raid1: fix crash with mirror recovery and discard
commit 751f188dd5ab95b3f2b5f2f467c38aae5a2877eb upstream.
This patch fixes a crash when a discard request is sent during mirror
recovery.
Firstly, some background. Generally, the following sequence happens during
mirror synchronization:
- function do_recovery is called
- do_recovery calls dm_rh_recovery_prepare
- dm_rh_recovery_prepare uses a semaphore to limit the number
simultaneously recovered regions (by default the semaphore value is 1,
so only one region at a time is recovered)
- dm_rh_recovery_prepare calls __rh_recovery_prepare,
__rh_recovery_prepare asks the log driver for the next region to
recover. Then, it sets the region state to DM_RH_RECOVERING. If there
are no pending I/Os on this region, the region is added to
quiesced_regions list. If there are pending I/Os, the region is not
added to any list. It is added to the quiesced_regions list later (by
dm_rh_dec function) when all I/Os finish.
- when the region is on quiesced_regions list, there are no I/Os in
flight on this region. The region is popped from the list in
dm_rh_recovery_start function. Then, a kcopyd job is started in the
recover function.
- when the kcopyd job finishes, recovery_complete is called. It calls
dm_rh_recovery_end. dm_rh_recovery_end adds the region to
recovered_regions or failed_recovered_regions list (depending on
whether the copy operation was successful or not).
The above mechanism assumes that if the region is in DM_RH_RECOVERING
state, no new I/Os are started on this region. When I/O is started,
dm_rh_inc_pending is called, which increases reg->pending count. When
I/O is finished, dm_rh_dec is called. It decreases reg->pending count.
If the count is zero and the region was in DM_RH_RECOVERING state,
dm_rh_dec adds it to the quiesced_regions list.
Consequently, if we call dm_rh_inc_pending/dm_rh_dec while the region is
in DM_RH_RECOVERING state, it could be added to quiesced_regions list
multiple times or it could be added to this list when kcopyd is copying
data (it is assumed that the region is not on any list while kcopyd does
its jobs). This results in memory corruption and crash.
There already exist bypasses for REQ_FLUSH requests: REQ_FLUSH requests
do not belong to any region, so they are always added to the sync list
in do_writes. dm_rh_inc_pending does not increase count for REQ_FLUSH
requests. In mirror_end_io, dm_rh_dec is never called for REQ_FLUSH
requests. These bypasses avoid the crash possibility described above.
These bypasses were improperly implemented for REQ_DISCARD when
the mirror target gained discard support in commit
5fc2ffeabb9ee0fc0e71ff16b49f34f0ed3d05b4 (dm raid1: support discard).
In do_writes, REQ_DISCARD requests is always added to the sync queue and
immediately dispatched (even if the region is in DM_RH_RECOVERING). However,
dm_rh_inc and dm_rh_dec is called for REQ_DISCARD resusts. So it violates the
rule that no I/Os are started on DM_RH_RECOVERING regions, and causes the list
corruption described above.
This patch changes it so that REQ_DISCARD requests follow the same path
as REQ_FLUSH. This avoids the crash.
Reference: https://bugzilla.redhat.com/837607
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/md/dm-raid1.c | 2 +-
drivers/md/dm-region-hash.c | 5 ++++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057..42ef54f 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1210,7 +1210,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
* We need to dec pending if this was a write.
*/
if (rw == WRITE) {
- if (!(bio->bi_rw & REQ_FLUSH))
+ if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)))
dm_rh_dec(ms->rh, map_context->ll);
return error;
}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7771ed2..69732e0 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
return;
}
+ if (bio->bi_rw & REQ_DISCARD)
+ return;
+
/* We must inform the log that the sync count has changed. */
log->type->set_region_sync(log, region, 0);
@@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
struct bio *bio;
for (bio = bios->head; bio; bio = bio->bi_next) {
- if (bio->bi_rw & REQ_FLUSH)
+ if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))
continue;
rh_inc(rh, dm_rh_bio_to_region(rh, bio));
}
--
1.7.7.6
@@ -0,0 +1,41 @@
From e8cf7231ce4f6464f8962ae6ef0421da40ddad15 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 20 Jul 2012 14:25:07 +0100
Subject: [PATCH 073/109] dm raid1: set discard_zeroes_data_unsupported
commit 7c8d3a42fe1c58a7e8fd3f6a013e7d7b474ff931 upstream.
We can't guarantee that REQ_DISCARD on dm-mirror zeroes the data even if
the underlying disks support zero on discard. So this patch sets
ti->discard_zeroes_data_unsupported.
For example, if the mirror is in the process of resynchronizing, it may
happen that kcopyd reads a piece of data, then discard is sent on the
same area and then kcopyd writes the piece of data to another leg.
Consequently, the data is not zeroed.
The flag was made available by commit 983c7db347db8ce2d8453fd1d89b7a4bb6920d56
(dm crypt: always disable discard_zeroes_data).
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/md/dm-raid1.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 42ef54f..dae2b7a 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1080,6 +1080,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->split_io = dm_rh_get_region_size(ms->rh);
ti->num_flush_requests = 1;
ti->num_discard_requests = 1;
+ ti->discard_zeroes_data_unsupported = 1;
ms->kmirrord_wq = alloc_workqueue("kmirrord",
WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
--
1.7.7.6
@@ -0,0 +1,349 @@
From 19aeba1469884ed9a789b143cf73ce047663c095 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 17 Jul 2012 03:05:14 -0400
Subject: [PATCH 074/109] ntp: Fix leap-second hrtimer livelock
This is a backport of 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d
This should have been backported when it was commited, but I
mistook the problem as requiring the ntp_lock changes
that landed in 3.4 in order for it to occur.
Unfortunately the same issue can happen (with only one cpu)
as follows:
do_adjtimex()
write_seqlock_irq(&xtime_lock);
process_adjtimex_modes()
process_adj_status()
ntp_start_leap_timer()
hrtimer_start()
hrtimer_reprogram()
tick_program_event()
clockevents_program_event()
ktime_get()
seq = req_seqbegin(xtime_lock); [DEADLOCK]
This deadlock will no always occur, as it requires the
leap_timer to force a hrtimer_reprogram which only happens
if its set and there's no sooner timer to expire.
NOTE: This patch, being faithful to the original commit,
introduces a bug (we don't update wall_to_monotonic),
which will be resovled by backporting a following fix.
Original commit message below:
Since commit 7dffa3c673fbcf835cd7be80bb4aec8ad3f51168 the ntp
subsystem has used an hrtimer for triggering the leapsecond
adjustment. However, this can cause a potential livelock.
Thomas diagnosed this as the following pattern:
CPU 0 CPU 1
do_adjtimex()
spin_lock_irq(&ntp_lock);
process_adjtimex_modes(); timer_interrupt()
process_adj_status(); do_timer()
ntp_start_leap_timer(); write_lock(&xtime_lock);
hrtimer_start(); update_wall_time();
hrtimer_reprogram(); ntp_tick_length()
tick_program_event() spin_lock(&ntp_lock);
clockevents_program_event()
ktime_get()
seq = req_seqbegin(xtime_lock);
This patch tries to avoid the problem by reverting back to not using
an hrtimer to inject leapseconds, and instead we handle the leapsecond
processing in the second_overflow() function.
The downside to this change is that on systems that support highres
timers, the leap second processing will occur on a HZ tick boundary,
(ie: ~1-10ms, depending on HZ) after the leap second instead of
possibly sooner (~34us in my tests w/ x86_64 lapic).
This patch applies on top of tip/timers/core.
CC: Sasha Levin <levinsasha928@gmail.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Sasha Levin <levinsasha928@gmail.com>
Diagnoised-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
include/linux/timex.h | 2 +-
kernel/time/ntp.c | 122 +++++++++++++++------------------------------
kernel/time/timekeeping.c | 18 +++----
3 files changed, 48 insertions(+), 94 deletions(-)
diff --git a/include/linux/timex.h b/include/linux/timex.h
index aa60fe7..08e90fb 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -266,7 +266,7 @@ static inline int ntp_synced(void)
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 tick_length;
-extern void second_overflow(void);
+extern int second_overflow(unsigned long secs);
extern void update_ntp_one_tick(void);
extern int do_adjtimex(struct timex *);
extern void hardpps(const struct timespec *, const struct timespec *);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4b85a7a..4508f7f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,8 +31,6 @@ unsigned long tick_nsec;
u64 tick_length;
static u64 tick_length_base;
-static struct hrtimer leap_timer;
-
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -350,60 +348,60 @@ void ntp_clear(void)
}
/*
- * Leap second processing. If in leap-insert state at the end of the
- * day, the system clock is set back one second; if in leap-delete
- * state, the system clock is set ahead one second.
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ * Also handles leap second processing, and returns leap offset
*/
-static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
+int second_overflow(unsigned long secs)
{
- enum hrtimer_restart res = HRTIMER_NORESTART;
-
- write_seqlock(&xtime_lock);
+ int leap = 0;
+ s64 delta;
+ /*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second.
+ */
switch (time_state) {
case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
break;
case TIME_INS:
- timekeeping_leap_insert(-1);
- time_state = TIME_OOP;
- printk(KERN_NOTICE
- "Clock: inserting leap second 23:59:60 UTC\n");
- hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
- res = HRTIMER_RESTART;
+ if (secs % 86400 == 0) {
+ leap = -1;
+ time_state = TIME_OOP;
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
+ }
break;
case TIME_DEL:
- timekeeping_leap_insert(1);
- time_tai--;
- time_state = TIME_WAIT;
- printk(KERN_NOTICE
- "Clock: deleting leap second 23:59:59 UTC\n");
+ if ((secs + 1) % 86400 == 0) {
+ leap = 1;
+ time_tai--;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
+ }
break;
case TIME_OOP:
time_tai++;
time_state = TIME_WAIT;
- /* fall through */
+ break;
+
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
break;
}
- write_sequnlock(&xtime_lock);
-
- return res;
-}
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- */
-void second_overflow(void)
-{
- s64 delta;
/* Bump the maxerror field */
time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -423,23 +421,25 @@ void second_overflow(void)
pps_dec_valid();
if (!time_adjust)
- return;
+ goto out;
if (time_adjust > MAX_TICKADJ) {
time_adjust -= MAX_TICKADJ;
tick_length += MAX_TICKADJ_SCALED;
- return;
+ goto out;
}
if (time_adjust < -MAX_TICKADJ) {
time_adjust += MAX_TICKADJ;
tick_length -= MAX_TICKADJ_SCALED;
- return;
+ goto out;
}
tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
<< NTP_SCALE_SHIFT;
time_adjust = 0;
+out:
+ return leap;
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -501,27 +501,6 @@ static void notify_cmos_timer(void)
static inline void notify_cmos_timer(void) { }
#endif
-/*
- * Start the leap seconds timer:
- */
-static inline void ntp_start_leap_timer(struct timespec *ts)
-{
- long now = ts->tv_sec;
-
- if (time_status & STA_INS) {
- time_state = TIME_INS;
- now += 86400 - now % 86400;
- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
-
- return;
- }
-
- if (time_status & STA_DEL) {
- time_state = TIME_DEL;
- now += 86400 - (now + 1) % 86400;
- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
- }
-}
/*
* Propagate a new txc->status value into the NTP state:
@@ -546,22 +525,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
time_status &= STA_RONLY;
time_status |= txc->status & ~STA_RONLY;
- switch (time_state) {
- case TIME_OK:
- ntp_start_leap_timer(ts);
- break;
- case TIME_INS:
- case TIME_DEL:
- time_state = TIME_OK;
- ntp_start_leap_timer(ts);
- case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
- break;
- case TIME_OOP:
- hrtimer_restart(&leap_timer);
- break;
- }
}
/*
* Called with the xtime lock held, so we can access and modify
@@ -643,9 +606,6 @@ int do_adjtimex(struct timex *txc)
(txc->tick < 900000/USER_HZ ||
txc->tick > 1100000/USER_HZ))
return -EINVAL;
-
- if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
- hrtimer_cancel(&leap_timer);
}
if (txc->modes & ADJ_SETOFFSET) {
@@ -967,6 +927,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
ntp_clear();
- hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- leap_timer.function = ntp_leap_second;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2378413..4780a7d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -169,15 +169,6 @@ static struct timespec raw_time;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-/* must hold xtime_lock */
-void timekeeping_leap_insert(int leapsecond)
-{
- xtime.tv_sec += leapsecond;
- wall_to_monotonic.tv_sec -= leapsecond;
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
-}
-
/**
* timekeeping_forward_now - update clock to the current time
*
@@ -942,9 +933,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
while (timekeeper.xtime_nsec >= nsecps) {
+ int leap;
timekeeper.xtime_nsec -= nsecps;
xtime.tv_sec++;
- second_overflow();
+ leap = second_overflow(xtime.tv_sec);
+ xtime.tv_sec += leap;
}
/* Accumulate raw time */
@@ -1050,9 +1043,12 @@ static void update_wall_time(void)
* xtime.tv_nsec isn't larger then NSEC_PER_SEC
*/
if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
+ int leap;
xtime.tv_nsec -= NSEC_PER_SEC;
xtime.tv_sec++;
- second_overflow();
+ leap = second_overflow(xtime.tv_sec);
+ xtime.tv_sec += leap;
+
}
/* check to see if there is a new clocksource to use */
--
1.7.7.6
@@ -0,0 +1,42 @@
From 106227da17ad8a279e7e104b8592ada4e81dbd8b Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Thu, 26 Apr 2012 14:11:32 +0200
Subject: [PATCH 075/109] ntp: Correct TAI offset during leap second
commit dd48d708ff3e917f6d6b6c2b696c3f18c019feed upstream.
When repeating a UTC time value during a leap second (when the UTC
time should be 23:59:60), the TAI timescale should not stop. The kernel
NTP code increments the TAI offset one second too late. This patch fixes
the issue by incrementing the offset during the leap second itself.
Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/time/ntp.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4508f7f..f1eb182 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -378,6 +378,7 @@ int second_overflow(unsigned long secs)
if (secs % 86400 == 0) {
leap = -1;
time_state = TIME_OOP;
+ time_tai++;
printk(KERN_NOTICE
"Clock: inserting leap second 23:59:60 UTC\n");
}
@@ -392,7 +393,6 @@ int second_overflow(unsigned long secs)
}
break;
case TIME_OOP:
- time_tai++;
time_state = TIME_WAIT;
break;
--
1.7.7.6
@@ -0,0 +1,53 @@
From 7a063ddaad98f05f7976e0e8c9c1455cc9d0f5da Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 30 May 2012 10:54:57 -0700
Subject: [PATCH 076/109] timekeeping: Fix CLOCK_MONOTONIC inconsistency
during leapsecond
This is a backport of fad0c66c4bb836d57a5f125ecd38bed653ca863a
which resolves a bug the previous commit.
Commit 6b43ae8a61 (ntp: Fix leap-second hrtimer livelock) broke the
leapsecond update of CLOCK_MONOTONIC. The missing leapsecond update to
wall_to_monotonic causes discontinuities in CLOCK_MONOTONIC.
Adjust wall_to_monotonic when NTP inserted a leapsecond.
Reported-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Tested-by: Richard Cochran <richardcochran@gmail.com>
Link: http://lkml.kernel.org/r/1338400497-12420-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/time/timekeeping.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4780a7d..5c9b67e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -938,6 +938,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
xtime.tv_sec++;
leap = second_overflow(xtime.tv_sec);
xtime.tv_sec += leap;
+ wall_to_monotonic.tv_sec -= leap;
}
/* Accumulate raw time */
@@ -1048,7 +1049,7 @@ static void update_wall_time(void)
xtime.tv_sec++;
leap = second_overflow(xtime.tv_sec);
xtime.tv_sec += leap;
-
+ wall_to_monotonic.tv_sec -= leap;
}
/* check to see if there is a new clocksource to use */
--
1.7.7.6
@@ -0,0 +1,99 @@
From 540e83f9da352839ff29ce5445fc499de8d54570 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 13 Nov 2011 23:19:49 +0000
Subject: [PATCH 077/109] time: Move common updates to a function
This is a backport of cc06268c6a87db156af2daed6e96a936b955cc82
[John Stultz: While not a bugfix itself, it allows following fixes
to backport in a more straightforward manner.]
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/time/timekeeping.c | 34 +++++++++++++++++-----------------
1 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5c9b67e..5d55185 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -166,6 +166,19 @@ static struct timespec total_sleep_time;
*/
static struct timespec raw_time;
+/* must hold write on xtime_lock */
+static void timekeeping_update(bool clearntp)
+{
+ if (clearntp) {
+ timekeeper.ntp_error = 0;
+ ntp_clear();
+ }
+ update_vsyscall(&xtime, &wall_to_monotonic,
+ timekeeper.clock, timekeeper.mult);
+}
+
+
+
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -366,11 +379,7 @@ int do_settimeofday(const struct timespec *tv)
xtime = *tv;
- timekeeper.ntp_error = 0;
- ntp_clear();
-
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(true);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -403,11 +412,7 @@ int timekeeping_inject_offset(struct timespec *ts)
xtime = timespec_add(xtime, *ts);
wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
- timekeeper.ntp_error = 0;
- ntp_clear();
-
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(true);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -636,10 +641,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
__timekeeping_inject_sleeptime(delta);
- timekeeper.ntp_error = 0;
- ntp_clear();
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(true);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -1052,9 +1054,7 @@ static void update_wall_time(void)
wall_to_monotonic.tv_sec -= leap;
}
- /* check to see if there is a new clocksource to use */
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(false);
}
/**
--
1.7.7.6
@@ -0,0 +1,118 @@
From b6da5d5a3a7e128fd17b15dc64fda7c1aea694e6 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 10 Jul 2012 18:43:19 -0400
Subject: [PATCH 078/109] hrtimer: Provide clock_was_set_delayed()
commit f55a6faa384304c89cfef162768e88374d3312cb upstream.
clock_was_set() cannot be called from hard interrupt context because
it calls on_each_cpu().
For fixing the widely reported leap seconds issue it is necessary to
call it from hard interrupt context, i.e. the timer tick code, which
does the timekeeping updates.
Provide a new function which denotes it in the hrtimer cpu base
structure of the cpu on which it is called and raise the hrtimer
softirq. We then execute the clock_was_set() notificiation from
softirq context in run_hrtimer_softirq(). The hrtimer softirq is
rarely used, so polling the flag there is not a performance issue.
[ tglx: Made it depend on CONFIG_HIGH_RES_TIMERS. We really should get
rid of all this ifdeffery ASAP ]
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Reported-by: Jan Engelhardt <jengelh@inai.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Link: http://lkml.kernel.org/r/1341960205-56738-2-git-send-email-johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
include/linux/hrtimer.h | 9 ++++++++-
kernel/hrtimer.c | 20 ++++++++++++++++++++
2 files changed, 28 insertions(+), 1 deletions(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fd0dc30..c9ec940 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -165,6 +165,7 @@ enum hrtimer_base_type {
* @lock: lock protecting the base and associated clock bases
* and timers
* @active_bases: Bitfield to mark bases with active timers
+ * @clock_was_set: Indicates that clock was set from irq context.
* @expires_next: absolute time of the next event which was scheduled
* via clock_set_next_event()
* @hres_active: State of high resolution mode
@@ -177,7 +178,8 @@ enum hrtimer_base_type {
*/
struct hrtimer_cpu_base {
raw_spinlock_t lock;
- unsigned long active_bases;
+ unsigned int active_bases;
+ unsigned int clock_was_set;
#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t expires_next;
int hres_active;
@@ -286,6 +288,8 @@ extern void hrtimer_peek_ahead_timers(void);
# define MONOTONIC_RES_NSEC HIGH_RES_NSEC
# define KTIME_MONOTONIC_RES KTIME_HIGH_RES
+extern void clock_was_set_delayed(void);
+
#else
# define MONOTONIC_RES_NSEC LOW_RES_NSEC
@@ -306,6 +310,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
{
return 0;
}
+
+static inline void clock_was_set_delayed(void) { }
+
#endif
extern void clock_was_set(void);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf5..3c24fb2 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -717,6 +717,19 @@ static int hrtimer_switch_to_hres(void)
return 1;
}
+/*
+ * Called from timekeeping code to reprogramm the hrtimer interrupt
+ * device. If called from the timer interrupt context we defer it to
+ * softirq context.
+ */
+void clock_was_set_delayed(void)
+{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+ cpu_base->clock_was_set = 1;
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
+
#else
static inline int hrtimer_hres_active(void) { return 0; }
@@ -1395,6 +1408,13 @@ void hrtimer_peek_ahead_timers(void)
static void run_hrtimer_softirq(struct softirq_action *h)
{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+ if (cpu_base->clock_was_set) {
+ cpu_base->clock_was_set = 0;
+ clock_was_set();
+ }
+
hrtimer_peek_ahead_timers();
}
--
1.7.7.6
@@ -0,0 +1,66 @@
From 61642041369832a7f8d29d27fcd9e88d523f3163 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 10 Jul 2012 18:43:20 -0400
Subject: [PATCH 079/109] timekeeping: Fix leapsecond triggered load spike
issue
This is a backport of 4873fa070ae84a4115f0b3c9dfabc224f1bc7c51
The timekeeping code misses an update of the hrtimer subsystem after a
leap second happened. Due to that timers based on CLOCK_REALTIME are
either expiring a second early or late depending on whether a leap
second has been inserted or deleted until an operation is initiated
which causes that update. Unless the update happens by some other
means this discrepancy between the timekeeping and the hrtimer data
stays forever and timers are expired either early or late.
The reported immediate workaround - $ data -s "`date`" - is causing a
call to clock_was_set() which updates the hrtimer data structures.
See: http://www.sheeri.com/content/mysql-and-leap-second-high-cpu-and-fix
Add the missing clock_was_set() call to update_wall_time() in case of
a leap second event. The actual update is deferred to softirq context
as the necessary smp function call cannot be invoked from hard
interrupt context.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Reported-by: Jan Engelhardt <jengelh@inai.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Link: http://lkml.kernel.org/r/1341960205-56738-3-git-send-email-johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/time/timekeeping.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5d55185..8958ad7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -941,6 +941,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
leap = second_overflow(xtime.tv_sec);
xtime.tv_sec += leap;
wall_to_monotonic.tv_sec -= leap;
+ if (leap)
+ clock_was_set_delayed();
}
/* Accumulate raw time */
@@ -1052,6 +1054,8 @@ static void update_wall_time(void)
leap = second_overflow(xtime.tv_sec);
xtime.tv_sec += leap;
wall_to_monotonic.tv_sec -= leap;
+ if (leap)
+ clock_was_set_delayed();
}
timekeeping_update(false);
--
1.7.7.6
@@ -0,0 +1,104 @@
From 711cebfd4050d5a41606f9f8ad56986d0377df08 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 10 Jul 2012 18:43:21 -0400
Subject: [PATCH 080/109] timekeeping: Maintain ktime_t based offsets for
hrtimers
This is a backport of 5b9fe759a678e05be4937ddf03d50e950207c1c0
We need to update the hrtimer clock offsets from the hrtimer interrupt
context. To avoid conversions from timespec to ktime_t maintain a
ktime_t based representation of those offsets in the timekeeper. This
puts the conversion overhead into the code which updates the
underlying offsets and provides fast accessible values in the hrtimer
interrupt.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Link: http://lkml.kernel.org/r/1341960205-56738-4-git-send-email-johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[John Stultz: Backported to 3.2]
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/time/timekeeping.c | 25 ++++++++++++++++++++++++-
1 files changed, 24 insertions(+), 1 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8958ad7..d5d0e5d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -161,18 +161,34 @@ static struct timespec xtime __attribute__ ((aligned (16)));
static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
static struct timespec total_sleep_time;
+/* Offset clock monotonic -> clock realtime */
+static ktime_t offs_real;
+
+/* Offset clock monotonic -> clock boottime */
+static ktime_t offs_boot;
+
/*
* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
*/
static struct timespec raw_time;
/* must hold write on xtime_lock */
+static void update_rt_offset(void)
+{
+ struct timespec tmp, *wtm = &wall_to_monotonic;
+
+ set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
+ offs_real = timespec_to_ktime(tmp);
+}
+
+/* must hold write on xtime_lock */
static void timekeeping_update(bool clearntp)
{
if (clearntp) {
timekeeper.ntp_error = 0;
ntp_clear();
}
+ update_rt_offset();
update_vsyscall(&xtime, &wall_to_monotonic,
timekeeper.clock, timekeeper.mult);
}
@@ -587,6 +603,7 @@ void __init timekeeping_init(void)
}
set_normalized_timespec(&wall_to_monotonic,
-boot.tv_sec, -boot.tv_nsec);
+ update_rt_offset();
total_sleep_time.tv_sec = 0;
total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -595,6 +612,12 @@ void __init timekeeping_init(void)
/* time in seconds when suspend began */
static struct timespec timekeeping_suspend_time;
+static void update_sleep_time(struct timespec t)
+{
+ total_sleep_time = t;
+ offs_boot = timespec_to_ktime(t);
+}
+
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
* @delta: pointer to a timespec delta value
@@ -612,7 +635,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
xtime = timespec_add(xtime, *delta);
wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
- total_sleep_time = timespec_add(total_sleep_time, *delta);
+ update_sleep_time(timespec_add(total_sleep_time, *delta));
}
--
1.7.7.6
@@ -0,0 +1,61 @@
From 6783d1f7e46f4450b489d970bbf4d62db9296c1f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 10 Jul 2012 18:43:23 -0400
Subject: [PATCH 081/109] hrtimers: Move lock held region in
hrtimer_interrupt()
commit 196951e91262fccda81147d2bcf7fdab08668b40 upstream.
We need to update the base offsets from this code and we need to do
that under base->lock. Move the lock held region around the
ktime_get() calls. The ktime_get() calls are going to be replaced with
a function which gets the time and the offsets atomically.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/1341960205-56738-6-git-send-email-johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/hrtimer.c | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3c24fb2..8f320af 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1263,11 +1263,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
cpu_base->nr_events++;
dev->next_event.tv64 = KTIME_MAX;
+ raw_spin_lock(&cpu_base->lock);
entry_time = now = ktime_get();
retry:
expires_next.tv64 = KTIME_MAX;
-
- raw_spin_lock(&cpu_base->lock);
/*
* We set expires_next to KTIME_MAX here with cpu_base->lock
* held to prevent that a timer is enqueued in our queue via
@@ -1344,6 +1343,7 @@ retry:
* interrupt routine. We give it 3 attempts to avoid
* overreacting on some spurious event.
*/
+ raw_spin_lock(&cpu_base->lock);
now = ktime_get();
cpu_base->nr_retries++;
if (++retries < 3)
@@ -1356,6 +1356,7 @@ retry:
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
+ raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
if (delta.tv64 > cpu_base->max_hang_time.tv64)
cpu_base->max_hang_time = delta;
--
1.7.7.6
@@ -0,0 +1,94 @@
From d6cadfb2bf29a5913562d3f63c49a937bc98540d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 10 Jul 2012 18:43:24 -0400
Subject: [PATCH 082/109] timekeeping: Provide hrtimer update function
This is a backport of f6c06abfb3972ad4914cef57d8348fcb2932bc3b
To finally fix the infamous leap second issue and other race windows
caused by functions which change the offsets between the various time
bases (CLOCK_MONOTONIC, CLOCK_REALTIME and CLOCK_BOOTTIME) we need a
function which atomically gets the current monotonic time and updates
the offsets of CLOCK_REALTIME and CLOCK_BOOTTIME with minimalistic
overhead. The previous patch which provides ktime_t offsets allows us
to make this function almost as cheap as ktime_get() which is going to
be replaced in hrtimer_interrupt().
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/1341960205-56738-7-git-send-email-johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[John Stultz: Backported to 3.2]
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
include/linux/hrtimer.h | 1 +
kernel/time/timekeeping.c | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 35 insertions(+), 0 deletions(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index c9ec940..cc07d27 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -327,6 +327,7 @@ extern ktime_t ktime_get(void);
extern ktime_t ktime_get_real(void);
extern ktime_t ktime_get_boottime(void);
extern ktime_t ktime_get_monotonic_offset(void);
+extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot);
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d5d0e5d..4938c5e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1240,6 +1240,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
} while (read_seqretry(&xtime_lock, seq));
}
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * ktime_get_update_offsets - hrtimer helper
+ * @real: pointer to storage for monotonic -> realtime offset
+ * @_boot: pointer to storage for monotonic -> boottime offset
+ *
+ * Returns current monotonic time and updates the offsets
+ * Called from hrtimer_interupt() or retrigger_next_event()
+ */
+ktime_t ktime_get_update_offsets(ktime_t *real, ktime_t *boot)
+{
+ ktime_t now;
+ unsigned int seq;
+ u64 secs, nsecs;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ secs = xtime.tv_sec;
+ nsecs = xtime.tv_nsec;
+ nsecs += timekeeping_get_ns();
+ /* If arch requires, add in gettimeoffset() */
+ nsecs += arch_gettimeoffset();
+
+ *real = offs_real;
+ *boot = offs_boot;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ now = ktime_add_ns(ktime_set(secs, 0), nsecs);
+ now = ktime_sub(now, *real);
+ return now;
+}
+#endif
+
/**
* ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
*/
--
1.7.7.6
@@ -0,0 +1,125 @@
From 532c526ac9349430b41f6a16f32fc808c4270647 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 10 Jul 2012 18:43:25 -0400
Subject: [PATCH 083/109] hrtimer: Update hrtimer base offsets each
hrtimer_interrupt
commit 5baefd6d84163443215f4a99f6a20f054ef11236 upstream.
The update of the hrtimer base offsets on all cpus cannot be made
atomically from the timekeeper.lock held and interrupt disabled region
as smp function calls are not allowed there.
clock_was_set(), which enforces the update on all cpus, is called
either from preemptible process context in case of do_settimeofday()
or from the softirq context when the offset modification happened in
the timer interrupt itself due to a leap second.
In both cases there is a race window for an hrtimer interrupt between
dropping timekeeper lock, enabling interrupts and clock_was_set()
issuing the updates. Any interrupt which arrives in that window will
see the new time but operate on stale offsets.
So we need to make sure that an hrtimer interrupt always sees a
consistent state of time and offsets.
ktime_get_update_offsets() allows us to get the current monotonic time
and update the per cpu hrtimer base offsets from hrtimer_interrupt()
to capture a consistent state of monotonic time and the offsets. The
function replaces the existing ktime_get() calls in hrtimer_interrupt().
The overhead of the new function vs. ktime_get() is minimal as it just
adds two store operations.
This ensures that any changes to realtime or boottime offsets are
noticed and stored into the per-cpu hrtimer base structures, prior to
any hrtimer expiration and guarantees that timers are not expired early.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Link: http://lkml.kernel.org/r/1341960205-56738-8-git-send-email-johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/hrtimer.c | 28 ++++++++++++++--------------
1 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 8f320af..6db7a5e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
return 0;
}
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+
+ return ktime_get_update_offsets(offs_real, offs_boot);
+}
+
/*
* Retrigger next event is called after clock was set
*
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
- struct timespec realtime_offset, xtim, wtm, sleep;
if (!hrtimer_hres_active())
return;
- /* Optimized out for !HIGH_RES */
- get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
- set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
- /* Adjust CLOCK_REALTIME offset */
raw_spin_lock(&base->lock);
- base->clock_base[HRTIMER_BASE_REALTIME].offset =
- timespec_to_ktime(realtime_offset);
- base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
- timespec_to_ktime(sleep);
-
+ hrtimer_update_base(base);
hrtimer_force_reprogram(base, 0);
raw_spin_unlock(&base->lock);
}
@@ -710,7 +708,6 @@ static int hrtimer_switch_to_hres(void)
base->clock_base[i].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
-
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
local_irq_restore(flags);
@@ -1264,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
dev->next_event.tv64 = KTIME_MAX;
raw_spin_lock(&cpu_base->lock);
- entry_time = now = ktime_get();
+ entry_time = now = hrtimer_update_base(cpu_base);
retry:
expires_next.tv64 = KTIME_MAX;
/*
@@ -1342,9 +1339,12 @@ retry:
* We need to prevent that we loop forever in the hrtimer
* interrupt routine. We give it 3 attempts to avoid
* overreacting on some spurious event.
+ *
+ * Acquire base lock for updating the offsets and retrieving
+ * the current time.
*/
raw_spin_lock(&cpu_base->lock);
- now = ktime_get();
+ now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
goto retry;
--
1.7.7.6
@@ -0,0 +1,57 @@
From 666e725f56fc4c9a6e4f0e00b5a180866863f724 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Jul 2012 12:50:42 -0400
Subject: [PATCH 084/109] timekeeping: Add missing update call in
timekeeping_resume()
This is a backport of 3e997130bd2e8c6f5aaa49d6e3161d4d29b43ab0
The leap second rework unearthed another issue of inconsistent data.
On timekeeping_resume() the timekeeper data is updated, but nothing
calls timekeeping_update(), so now the update code in the timer
interrupt sees stale values.
This has been the case before those changes, but then the timer
interrupt was using stale data as well so this went unnoticed for quite
some time.
Add the missing update call, so all the data is consistent everywhere.
Reported-by: Andreas Schwab <schwab@linux-m68k.org>
Reported-and-tested-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Reported-and-tested-by: Martin Steigerwald <Martin@lichtvoll.de>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Linux PM list <linux-pm@vger.kernel.org>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
Cc: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[John Stultz: Backported to 3.2]
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
kernel/time/timekeeping.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4938c5e..03e67d4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -699,6 +699,7 @@ static void timekeeping_resume(void)
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
timekeeper.ntp_error = 0;
timekeeping_suspended = 0;
+ timekeeping_update(false);
write_sequnlock_irqrestore(&xtime_lock, flags);
touch_softlockup_watchdog();
--
1.7.7.6
@@ -0,0 +1,87 @@
From ec9436c865d11ebd0fd6909a9ef2a63e5536ff29 Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@linux-m68k.org>
Date: Fri, 9 Dec 2011 11:35:08 +0000
Subject: [PATCH 085/109] powerpc: Fix wrong divisor in usecs_to_cputime
commit 9f5072d4f63f28d30d343573830ac6c85fc0deff upstream.
Commit d57af9b (taskstats: use real microsecond granularity for CPU times)
renamed msecs_to_cputime to usecs_to_cputime, but failed to update all
numbers on the way. This causes nonsensical cpu idle/iowait values to be
displayed in /proc/stat (the only user of usecs_to_cputime so far).
This also renames __cputime_msec_factor to __cputime_usec_factor, adapting
its value and using it directly in cputime_to_usecs instead of doing two
multiplications.
Signed-off-by: Andreas Schwab <schwab@linux-m68k.org>
Acked-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/powerpc/include/asm/cputime.h | 6 +++---
arch/powerpc/kernel/time.c | 10 +++++-----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 98b7c4b..fa3f921 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -126,11 +126,11 @@ static inline u64 cputime64_to_jiffies64(const cputime_t ct)
/*
* Convert cputime <-> microseconds
*/
-extern u64 __cputime_msec_factor;
+extern u64 __cputime_usec_factor;
static inline unsigned long cputime_to_usecs(const cputime_t ct)
{
- return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC;
+ return mulhdu(ct, __cputime_usec_factor);
}
static inline cputime_t usecs_to_cputime(const unsigned long us)
@@ -143,7 +143,7 @@ static inline cputime_t usecs_to_cputime(const unsigned long us)
sec = us / 1000000;
if (ct) {
ct *= tb_ticks_per_sec;
- do_div(ct, 1000);
+ do_div(ct, 1000000);
}
if (sec)
ct += (cputime_t) sec * tb_ticks_per_sec;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 5db163c..ec8affe 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -168,13 +168,13 @@ EXPORT_SYMBOL_GPL(ppc_tb_freq);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
/*
* Factors for converting from cputime_t (timebase ticks) to
- * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds).
+ * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds).
* These are all stored as 0.64 fixed-point binary fractions.
*/
u64 __cputime_jiffies_factor;
EXPORT_SYMBOL(__cputime_jiffies_factor);
-u64 __cputime_msec_factor;
-EXPORT_SYMBOL(__cputime_msec_factor);
+u64 __cputime_usec_factor;
+EXPORT_SYMBOL(__cputime_usec_factor);
u64 __cputime_sec_factor;
EXPORT_SYMBOL(__cputime_sec_factor);
u64 __cputime_clockt_factor;
@@ -192,8 +192,8 @@ static void calc_cputime_factors(void)
div128_by_32(HZ, 0, tb_ticks_per_sec, &res);
__cputime_jiffies_factor = res.result_low;
- div128_by_32(1000, 0, tb_ticks_per_sec, &res);
- __cputime_msec_factor = res.result_low;
+ div128_by_32(1000000, 0, tb_ticks_per_sec, &res);
+ __cputime_usec_factor = res.result_low;
div128_by_32(1, 0, tb_ticks_per_sec, &res);
__cputime_sec_factor = res.result_low;
div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res);
--
1.7.7.6
@@ -0,0 +1,60 @@
From 164965e103d2cfc11c59b563aa95ce2e8c372b65 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@math.technion.ac.il>
Date: Mon, 27 Feb 2012 15:07:29 +0200
Subject: [PATCH 086/109] vhost: don't forget to schedule()
commit d550dda192c1bd039afb774b99485e88b70d7cb8 upstream.
This is a tiny, but important, patch to vhost.
Vhost's worker thread only called schedule() when it had no work to do, and
it wanted to go to sleep. But if there's always work to do, e.g., the guest
is running a network-intensive program like netperf with small message sizes,
schedule() was *never* called. This had several negative implications (on
non-preemptive kernels):
1. Passing time was not properly accounted to the "vhost" process (ps and
top would wrongly show it using zero CPU time).
2. Sometimes error messages about RCU timeouts would be printed, if the
core running the vhost thread didn't schedule() for a very long time.
3. Worst of all, a vhost thread would "hog" the core. If several vhost
threads need to share the same core, typically one would get most of the
CPU time (and its associated guest most of the performance), while the
others hardly get any work done.
The trivial solution is to add
if (need_resched())
schedule();
After doing every piece of work. This will not do the heavy schedule() all
the time, just when the timer interrupt decided a reschedule is warranted
(so need_resched returns true).
Thanks to Abel Gordon for this patch.
Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/vhost/vhost.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c14c42b..ae66278 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -222,6 +222,8 @@ static int vhost_worker(void *data)
if (work) {
__set_current_state(TASK_RUNNING);
work->fn(work);
+ if (need_resched())
+ schedule();
} else
schedule();
--
1.7.7.6
@@ -0,0 +1,43 @@
From 87b62a139d4385f3726820674127eaee29fc7cff Mon Sep 17 00:00:00 2001
From: Devendra Naga <devendra.aaru@gmail.com>
Date: Thu, 31 May 2012 01:51:20 +0000
Subject: [PATCH 087/109] r8169: call netif_napi_del at errpaths and at driver
unload
commit ad1be8d345416a794dea39761a374032aa471a76 upstream.
when register_netdev fails, the init'ed NAPIs by netif_napi_add must be
deleted with netif_napi_del, and also when driver unloads, it should
delete the NAPI before unregistering netdevice using unregister_netdev.
Signed-off-by: Devendra Naga <devendra.aaru@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/ethernet/realtek/r8169.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index cc2565c..9e61d6b 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4185,6 +4185,7 @@ out:
return rc;
err_out_msi_4:
+ netif_napi_del(&tp->napi);
rtl_disable_msi(pdev, tp);
iounmap(ioaddr);
err_out_free_res_3:
@@ -4210,6 +4211,8 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
cancel_delayed_work_sync(&tp->task);
+ netif_napi_del(&tp->napi);
+
unregister_netdev(dev);
rtl_release_firmware(tp);
--
1.7.7.6
@@ -0,0 +1,112 @@
From 4ffefa650ebbe2ef8bc2babff2e5686c33a2dab3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Jun 2012 23:50:04 +0000
Subject: [PATCH 088/109] bnx2x: fix checksum validation
commit d6cb3e41386f20fb0777d0b59a2def82c65d37f7 upstream.
bnx2x driver incorrectly sets ip_summed to CHECKSUM_UNNECESSARY on
encapsulated segments. TCP stack happily accepts frames with bad
checksums, if they are inside a GRE or IPIP encapsulation.
Our understanding is that if no IP or L4 csum validation was done by the
hardware, we should leave ip_summed as is (CHECKSUM_NONE), since
hardware doesn't provide CHECKSUM_COMPLETE support in its cqe.
Then, if IP/L4 checksumming was done by the hardware, set
CHECKSUM_UNNECESSARY if no error was flagged.
Patch based on findings and analysis from Robert Evans
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Eilon Greenstein <eilong@broadcom.com>
Cc: Yaniv Rosner <yanivr@broadcom.com>
Cc: Merav Sicron <meravs@broadcom.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Robert Evans <evansr@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Acked-by: Eilon Greenstein <eilong@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
[bwh: Backported to 3.2: adjust context, indentation]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 15 ------------
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 28 ++++++++++++++++++-----
2 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index aec7212..8dda46a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -723,21 +723,6 @@ struct bnx2x_fastpath {
#define ETH_RX_ERROR_FALGS ETH_FAST_PATH_RX_CQE_PHY_DECODE_ERR_FLG
-#define BNX2X_IP_CSUM_ERR(cqe) \
- (!((cqe)->fast_path_cqe.status_flags & \
- ETH_FAST_PATH_RX_CQE_IP_XSUM_NO_VALIDATION_FLG) && \
- ((cqe)->fast_path_cqe.type_error_flags & \
- ETH_FAST_PATH_RX_CQE_IP_BAD_XSUM_FLG))
-
-#define BNX2X_L4_CSUM_ERR(cqe) \
- (!((cqe)->fast_path_cqe.status_flags & \
- ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG) && \
- ((cqe)->fast_path_cqe.type_error_flags & \
- ETH_FAST_PATH_RX_CQE_L4_BAD_XSUM_FLG))
-
-#define BNX2X_RX_CSUM_OK(cqe) \
- (!(BNX2X_L4_CSUM_ERR(cqe) || BNX2X_IP_CSUM_ERR(cqe)))
-
#define BNX2X_PRS_FLAG_OVERETH_IPV4(flags) \
(((le16_to_cpu(flags) & \
PARSING_FLAGS_OVER_ETHERNET_PROTOCOL) >> \
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 580b44e..27d6d6c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -551,6 +551,26 @@ static inline void bnx2x_set_skb_rxhash(struct bnx2x *bp, union eth_rx_cqe *cqe,
le32_to_cpu(cqe->fast_path_cqe.rss_hash_result);
}
+static void bnx2x_csum_validate(struct sk_buff *skb, union eth_rx_cqe *cqe,
+ struct bnx2x_fastpath *fp)
+{
+ /* Do nothing if no IP/L4 csum validation was done */
+
+ if (cqe->fast_path_cqe.status_flags &
+ (ETH_FAST_PATH_RX_CQE_IP_XSUM_NO_VALIDATION_FLG |
+ ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG))
+ return;
+
+ /* If both IP/L4 validation were done, check if an error was found. */
+
+ if (cqe->fast_path_cqe.type_error_flags &
+ (ETH_FAST_PATH_RX_CQE_IP_BAD_XSUM_FLG |
+ ETH_FAST_PATH_RX_CQE_L4_BAD_XSUM_FLG))
+ fp->eth_q_stats.hw_csum_err++;
+ else
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+}
+
int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
{
struct bnx2x *bp = fp->bp;
@@ -746,13 +766,9 @@ reuse_rx:
skb_checksum_none_assert(skb);
- if (bp->dev->features & NETIF_F_RXCSUM) {
+ if (bp->dev->features & NETIF_F_RXCSUM)
+ bnx2x_csum_validate(skb, cqe, fp);
- if (likely(BNX2X_RX_CSUM_OK(cqe)))
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- else
- fp->eth_q_stats.hw_csum_err++;
- }
}
skb_record_rx_queue(skb, fp->index);
--
1.7.7.6
@@ -0,0 +1,75 @@
From 9a59f534e5f1d432bf63f0ed6cb184b1ce988063 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Jun 2012 09:45:16 +0000
Subject: [PATCH 089/109] bnx2x: fix panic when TX ring is full
commit bc14786a100cc6a81cd060e8031ec481241b418c upstream.
There is a off by one error in the minimal number of BD in
bnx2x_start_xmit() and bnx2x_tx_int() before stopping/resuming tx queue.
A full size GSO packet, with data included in skb->head really needs
(MAX_SKB_FRAGS + 4) BDs, because of bnx2x_tx_split()
This error triggers if BQL is disabled and heavy TCP transmit traffic
occurs.
bnx2x_tx_split() definitely can be called, remove a wrong comment.
Reported-by: Tomas Hruby <thruby@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Eilon Greenstein <eilong@broadcom.com>
Cc: Yaniv Rosner <yanivr@broadcom.com>
Cc: Merav Sicron <meravs@broadcom.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Robert Evans <evansr@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 8 +++-----
1 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 27d6d6c..2c1a5c0 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -220,7 +220,7 @@ int bnx2x_tx_int(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata)
if ((netif_tx_queue_stopped(txq)) &&
(bp->state == BNX2X_STATE_OPEN) &&
- (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 3))
+ (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 4))
netif_tx_wake_queue(txq);
__netif_tx_unlock(txq);
@@ -2254,8 +2254,6 @@ int bnx2x_poll(struct napi_struct *napi, int budget)
/* we split the first BD into headers and data BDs
* to ease the pain of our fellow microcode engineers
* we use one mapping for both BDs
- * So far this has only been observed to happen
- * in Other Operating Systems(TM)
*/
static noinline u16 bnx2x_tx_split(struct bnx2x *bp,
struct bnx2x_fp_txdata *txdata,
@@ -2906,7 +2904,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
txdata->tx_bd_prod += nbd;
- if (unlikely(bnx2x_tx_avail(bp, txdata) < MAX_SKB_FRAGS + 3)) {
+ if (unlikely(bnx2x_tx_avail(bp, txdata) < MAX_SKB_FRAGS + 4)) {
netif_tx_stop_queue(txq);
/* paired memory barrier is in bnx2x_tx_int(), we have to keep
@@ -2915,7 +2913,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
smp_mb();
fp->eth_q_stats.driver_xoff++;
- if (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 3)
+ if (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 4)
netif_tx_wake_queue(txq);
}
txdata->tx_pkt++;
--
1.7.7.6
@@ -0,0 +1,135 @@
From 401354ebe4d79d6edf536ad7b69e59afeec81308 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 14 Jun 2012 06:42:44 +0000
Subject: [PATCH 090/109] net: remove skb_orphan_try()
commit 62b1a8ab9b3660bb820d8dfe23148ed6cda38574 upstream.
Orphaning skb in dev_hard_start_xmit() makes bonding behavior
unfriendly for applications sending big UDP bursts : Once packets
pass the bonding device and come to real device, they might hit a full
qdisc and be dropped. Without orphaning, the sender is automatically
throttled because sk->sk_wmemalloc reaches sk->sk_sndbuf (assuming
sk_sndbuf is not too big)
We could try to defer the orphaning adding another test in
dev_hard_start_xmit(), but all this seems of little gain,
now that BQL tends to make packets more likely to be parked
in Qdisc queues instead of NIC TX ring, in cases where performance
matters.
Reverts commits :
fc6055a5ba31 net: Introduce skb_orphan_try()
87fd308cfc6b net: skb_tx_hash() fix relative to skb_orphan_try()
and removes SKBTX_DRV_NEEDS_SK_REF flag
Reported-and-bisected-by: Jean-Michel Hautbois <jhautbois@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Oliver Hartkopp <socketcan@hartkopp.net>
Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
[bwh: Backported to 3.2:
- Adjust context
- SKBTX_WIFI_STATUS is not defined]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
include/linux/skbuff.h | 5 +----
net/can/raw.c | 3 ---
net/core/dev.c | 23 +----------------------
net/iucv/af_iucv.c | 1 -
4 files changed, 2 insertions(+), 30 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bdb4590..53dc7e7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -213,11 +213,8 @@ enum {
/* device driver is going to provide hardware time stamp */
SKBTX_IN_PROGRESS = 1 << 2,
- /* ensure the originating sk reference is available on driver level */
- SKBTX_DRV_NEEDS_SK_REF = 1 << 3,
-
/* device driver supports TX zero-copy buffers */
- SKBTX_DEV_ZEROCOPY = 1 << 4,
+ SKBTX_DEV_ZEROCOPY = 1 << 3,
};
/*
diff --git a/net/can/raw.c b/net/can/raw.c
index cde1b4a..46cca3a 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -681,9 +681,6 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
if (err < 0)
goto free_skb;
- /* to be able to check the received tx sock reference in raw_rcv() */
- skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF;
-
skb->dev = dev;
skb->sk = sk;
diff --git a/net/core/dev.c b/net/core/dev.c
index 1cbddc9..5738654 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2079,25 +2079,6 @@ static int dev_gso_segment(struct sk_buff *skb, int features)
return 0;
}
-/*
- * Try to orphan skb early, right before transmission by the device.
- * We cannot orphan skb if tx timestamp is requested or the sk-reference
- * is needed on driver level for other reasons, e.g. see net/can/raw.c
- */
-static inline void skb_orphan_try(struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
-
- if (sk && !skb_shinfo(skb)->tx_flags) {
- /* skb_tx_hash() wont be able to get sk.
- * We copy sk_hash into skb->rxhash
- */
- if (!skb->rxhash)
- skb->rxhash = sk->sk_hash;
- skb_orphan(skb);
- }
-}
-
static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
return ((features & NETIF_F_GEN_CSUM) ||
@@ -2182,8 +2163,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
- skb_orphan_try(skb);
-
features = netif_skb_features(skb);
if (vlan_tx_tag_present(skb) &&
@@ -2293,7 +2272,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
if (skb->sk && skb->sk->sk_hash)
hash = skb->sk->sk_hash;
else
- hash = (__force u16) skb->protocol ^ skb->rxhash;
+ hash = (__force u16) skb->protocol;
hash = jhash_1word(hash, hashrnd);
return (u16) (((u64) hash * qcount) >> 32) + qoffset;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 274d150..cf98d62 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -380,7 +380,6 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
skb_trim(skb, skb->dev->mtu);
}
skb->protocol = ETH_P_AF_IUCV;
- skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF;
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return -ENOMEM;
--
1.7.7.6
@@ -0,0 +1,78 @@
From 16fe1810332abe3998fa2e0760af7f8ca5f701d6 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Mon, 4 Jun 2012 15:00:04 +0800
Subject: [PATCH 091/109] ACPI: Make acpi_skip_timer_override cover all
source_irq==0 cases
commit ae10ccdc3093486f8c2369d227583f9d79f628e5 upstream.
Currently when acpi_skip_timer_override is set, it only cover the
(source_irq == 0 && global_irq == 2) cases. While there is also
platform which need use this option and its global_irq is not 2.
This patch will extend acpi_skip_timer_override to cover all
timer overriding cases as long as the source irq is 0.
This is the first part of a fix to kernel bug bugzilla 40002:
"IRQ 0 assigned to VGA"
https://bugzilla.kernel.org/show_bug.cgi?id=40002
Reported-and-tested-by: Szymon Kowalczyk <fazerxlo@o2.pl>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/acpi/boot.c | 14 ++++++++------
1 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4558f0d..a94dc95 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -416,12 +416,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
return 0;
}
- if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
+ if (intsrc->source_irq == 0) {
if (acpi_skip_timer_override) {
- printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+ printk(PREFIX "BIOS IRQ0 override ignored.\n");
return 0;
}
- if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+
+ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
}
@@ -1327,7 +1329,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
}
/*
- * Force ignoring BIOS IRQ0 pin2 override
+ * Force ignoring BIOS IRQ0 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
@@ -1337,7 +1339,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
*/
if (!acpi_skip_timer_override) {
WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
d->ident);
acpi_skip_timer_override = 1;
}
@@ -1431,7 +1433,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* is enabled. This input is incorrectly designated the
* ISA IRQ 0 via an interrupt source override even though
* it is wired to the output of the master 8259A and INTIN0
- * is not connected at all. Force ignoring BIOS IRQ0 pin2
+ * is not connected at all. Force ignoring BIOS IRQ0
* override in that cases.
*/
{
--
1.7.7.6
@@ -0,0 +1,42 @@
From b9c36e346bdb1c7bd2edd7489561f94155195e6e Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Mon, 4 Jun 2012 15:00:05 +0800
Subject: [PATCH 092/109] ACPI: Remove one board specific WARN when ignoring
timer overriding
commit 5752cdb805ff89942d99d12118e2844e7db34df8 upstream.
commit 7f68b4c2e158019c2ec494b5cfbd9c83b4e5b253 upstream.
Current WARN msg is only for the ati_ixp4x0 board, while this function
is used by mulitple platforms. So this one board specific warning
is not appropriate any more.
Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/acpi/boot.c | 5 -----
1 files changed, 0 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a94dc95..882960e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1333,12 +1333,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
- /*
- * The ati_ixp4x0_rev() early PCI quirk should have set
- * the acpi_skip_timer_override flag already:
- */
if (!acpi_skip_timer_override) {
- WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
d->ident);
acpi_skip_timer_override = 1;
--
1.7.7.6
@@ -0,0 +1,50 @@
From 0430a8402ee6125c909ae3f4c3a89696e9a24077 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Mon, 4 Jun 2012 15:00:06 +0800
Subject: [PATCH 093/109] ACPI: Add a quirk for "AMILO PRO V2030" to ignore
the timer overriding
commit b939c2acf1dc42b08407ef5174f2e8d6f43dd5ea upstream.
commit f6b54f083cc66cf9b11d2120d8df3c2ad4e0836d upstream.
This is the 2nd part of fix for kernel bugzilla 40002:
"IRQ 0 assigned to VGA"
https://bugzilla.kernel.org/show_bug.cgi?id=40002
The root cause is the buggy FW, whose ACPI tables assign the GSI 16
to 2 irqs 0 and 16(VGA), and the VGA is the right owner of GSI 16.
So add a quirk to ignore the irq0 overriding GSI 16 for the
FUJITSU SIEMENS AMILO PRO V2030 platform will solve this issue.
Reported-and-tested-by: Szymon Kowalczyk <fazerxlo@o2.pl>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/acpi/boot.c | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 882960e..479d03c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1463,6 +1463,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
{}
};
--
1.7.7.6
@@ -0,0 +1,43 @@
From 66b7502a0d37876e547c5440aa34bee18e3b0f1e Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Mon, 20 Feb 2012 14:20:06 +0800
Subject: [PATCH 094/109] ACPI, x86: fix Dell M6600 ACPI reboot regression via
DMI
commit 76eb9a30db4bc8fd172f9155247264b5f2686d7b upstream.
Dell Precision M6600 is known to require PCI reboot, so add it to
the reboot blacklist in pci_reboot_dmi_table[].
https://bugzilla.kernel.org/show_bug.cgi?id=42749
cc: x86@kernel.org
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/reboot.c | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 37a458b..e61f79c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -460,6 +460,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
},
},
+ { /* Handle problems with rebooting on the Precision M6600. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+ },
+ },
{ }
};
--
1.7.7.6
@@ -0,0 +1,43 @@
From b563da2fc80658815355acc804d1b8c21d1a88f9 Mon Sep 17 00:00:00 2001
From: Pavel Vasilyev <pavel@pavlinux.ru>
Date: Tue, 5 Jun 2012 00:02:05 -0400
Subject: [PATCH 095/109] ACPI sysfs.c strlen fix
commit 9f132652d94c96476b0b0a8caf0c10e96ab10fa8 upstream.
Current code is ignoring the last character of "enable" and "disable"
in comparisons.
https://bugzilla.kernel.org/show_bug.cgi?id=33732
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/acpi/sysfs.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c
index 9f66181..240a244 100644
--- a/drivers/acpi/sysfs.c
+++ b/drivers/acpi/sysfs.c
@@ -173,7 +173,7 @@ static int param_set_trace_state(const char *val, struct kernel_param *kp)
{
int result = 0;
- if (!strncmp(val, "enable", strlen("enable") - 1)) {
+ if (!strncmp(val, "enable", strlen("enable"))) {
result = acpi_debug_trace(trace_method_name, trace_debug_level,
trace_debug_layer, 0);
if (result)
@@ -181,7 +181,7 @@ static int param_set_trace_state(const char *val, struct kernel_param *kp)
goto exit;
}
- if (!strncmp(val, "disable", strlen("disable") - 1)) {
+ if (!strncmp(val, "disable", strlen("disable"))) {
int name = 0;
result = acpi_debug_trace((char *)&name, trace_debug_level,
trace_debug_layer, 0);
--
1.7.7.6
@@ -0,0 +1,98 @@
From 5daf178c74f17e523291b0c4eabbf3b3f3740b75 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Mon, 11 Jun 2012 09:24:11 -0700
Subject: [PATCH 096/109] eCryptfs: Gracefully refuse miscdev file ops on
inherited/passed files
commit 8dc6780587c99286c0d3de747a2946a76989414a upstream.
File operations on /dev/ecryptfs would BUG() when the operations were
performed by processes other than the process that originally opened the
file. This could happen with open files inherited after fork() or file
descriptors passed through IPC mechanisms. Rather than calling BUG(), an
error code can be safely returned in most situations.
In ecryptfs_miscdev_release(), eCryptfs still needs to handle the
release even if the last file reference is being held by a process that
didn't originally open the file. ecryptfs_find_daemon_by_euid() will not
be successful, so a pointer to the daemon is stored in the file's
private_data. The private_data pointer is initialized when the miscdev
file is opened and only used when the file is released.
https://launchpad.net/bugs/994247
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Reported-by: Sasha Levin <levinsasha928@gmail.com>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/ecryptfs/miscdev.c | 23 ++++++++++++++++-------
1 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 0dc5a3d..a050e4b 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -49,7 +49,10 @@ ecryptfs_miscdev_poll(struct file *file, poll_table *pt)
mutex_lock(&ecryptfs_daemon_hash_mux);
/* TODO: Just use file->private_data? */
rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
- BUG_ON(rc || !daemon);
+ if (rc || !daemon) {
+ mutex_unlock(&ecryptfs_daemon_hash_mux);
+ return -EINVAL;
+ }
mutex_lock(&daemon->mux);
mutex_unlock(&ecryptfs_daemon_hash_mux);
if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
@@ -122,6 +125,7 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
goto out_unlock_daemon;
}
daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN;
+ file->private_data = daemon;
atomic_inc(&ecryptfs_num_miscdev_opens);
out_unlock_daemon:
mutex_unlock(&daemon->mux);
@@ -152,9 +156,9 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file)
mutex_lock(&ecryptfs_daemon_hash_mux);
rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
- BUG_ON(rc || !daemon);
+ if (rc || !daemon)
+ daemon = file->private_data;
mutex_lock(&daemon->mux);
- BUG_ON(daemon->pid != task_pid(current));
BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN));
daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN;
atomic_dec(&ecryptfs_num_miscdev_opens);
@@ -246,8 +250,16 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
mutex_lock(&ecryptfs_daemon_hash_mux);
/* TODO: Just use file->private_data? */
rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
- BUG_ON(rc || !daemon);
+ if (rc || !daemon) {
+ mutex_unlock(&ecryptfs_daemon_hash_mux);
+ return -EINVAL;
+ }
mutex_lock(&daemon->mux);
+ if (task_pid(current) != daemon->pid) {
+ mutex_unlock(&daemon->mux);
+ mutex_unlock(&ecryptfs_daemon_hash_mux);
+ return -EPERM;
+ }
if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
rc = 0;
mutex_unlock(&ecryptfs_daemon_hash_mux);
@@ -284,9 +296,6 @@ check_list:
* message from the queue; try again */
goto check_list;
}
- BUG_ON(euid != daemon->euid);
- BUG_ON(current_user_ns() != daemon->user_ns);
- BUG_ON(task_pid(current) != daemon->pid);
msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue,
struct ecryptfs_msg_ctx, daemon_out_list);
BUG_ON(!msg_ctx);
--
1.7.7.6
@@ -0,0 +1,105 @@
From 3b0dfe936fb38efde98e2650ff18587c3285eb2a Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Mon, 11 Jun 2012 10:21:34 -0700
Subject: [PATCH 097/109] eCryptfs: Fix lockdep warning in miscdev operations
commit 60d65f1f07a7d81d3eb3b91fc13fca80f2fdbb12 upstream.
Don't grab the daemon mutex while holding the message context mutex.
Addresses this lockdep warning:
ecryptfsd/2141 is trying to acquire lock:
(&ecryptfs_msg_ctx_arr[i].mux){+.+.+.}, at: [<ffffffffa029c213>] ecryptfs_miscdev_read+0x143/0x470 [ecryptfs]
but task is already holding lock:
(&(*daemon)->mux){+.+...}, at: [<ffffffffa029c2ec>] ecryptfs_miscdev_read+0x21c/0x470 [ecryptfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&(*daemon)->mux){+.+...}:
[<ffffffff810a3b8d>] lock_acquire+0x9d/0x220
[<ffffffff8151c6da>] __mutex_lock_common+0x5a/0x4b0
[<ffffffff8151cc64>] mutex_lock_nested+0x44/0x50
[<ffffffffa029c5d7>] ecryptfs_send_miscdev+0x97/0x120 [ecryptfs]
[<ffffffffa029b744>] ecryptfs_send_message+0x134/0x1e0 [ecryptfs]
[<ffffffffa029a24e>] ecryptfs_generate_key_packet_set+0x2fe/0xa80 [ecryptfs]
[<ffffffffa02960f8>] ecryptfs_write_metadata+0x108/0x250 [ecryptfs]
[<ffffffffa0290f80>] ecryptfs_create+0x130/0x250 [ecryptfs]
[<ffffffff811963a4>] vfs_create+0xb4/0x120
[<ffffffff81197865>] do_last+0x8c5/0xa10
[<ffffffff811998f9>] path_openat+0xd9/0x460
[<ffffffff81199da2>] do_filp_open+0x42/0xa0
[<ffffffff81187998>] do_sys_open+0xf8/0x1d0
[<ffffffff81187a91>] sys_open+0x21/0x30
[<ffffffff81527d69>] system_call_fastpath+0x16/0x1b
-> #0 (&ecryptfs_msg_ctx_arr[i].mux){+.+.+.}:
[<ffffffff810a3418>] __lock_acquire+0x1bf8/0x1c50
[<ffffffff810a3b8d>] lock_acquire+0x9d/0x220
[<ffffffff8151c6da>] __mutex_lock_common+0x5a/0x4b0
[<ffffffff8151cc64>] mutex_lock_nested+0x44/0x50
[<ffffffffa029c213>] ecryptfs_miscdev_read+0x143/0x470 [ecryptfs]
[<ffffffff811887d3>] vfs_read+0xb3/0x180
[<ffffffff811888ed>] sys_read+0x4d/0x90
[<ffffffff81527d69>] system_call_fastpath+0x16/0x1b
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/ecryptfs/miscdev.c | 25 +++++++++++++------------
1 files changed, 13 insertions(+), 12 deletions(-)
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index a050e4b..de42310 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -195,31 +195,32 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
u16 msg_flags, struct ecryptfs_daemon *daemon)
{
- int rc = 0;
+ struct ecryptfs_message *msg;
- mutex_lock(&msg_ctx->mux);
- msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size),
- GFP_KERNEL);
- if (!msg_ctx->msg) {
- rc = -ENOMEM;
+ msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL);
+ if (!msg) {
printk(KERN_ERR "%s: Out of memory whilst attempting "
"to kmalloc(%zd, GFP_KERNEL)\n", __func__,
- (sizeof(*msg_ctx->msg) + data_size));
- goto out_unlock;
+ (sizeof(*msg) + data_size));
+ return -ENOMEM;
}
+
+ mutex_lock(&msg_ctx->mux);
+ msg_ctx->msg = msg;
msg_ctx->msg->index = msg_ctx->index;
msg_ctx->msg->data_len = data_size;
msg_ctx->type = msg_type;
memcpy(msg_ctx->msg->data, data, data_size);
msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
- mutex_lock(&daemon->mux);
list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue);
+ mutex_unlock(&msg_ctx->mux);
+
+ mutex_lock(&daemon->mux);
daemon->num_queued_msg_ctx++;
wake_up_interruptible(&daemon->wait);
mutex_unlock(&daemon->mux);
-out_unlock:
- mutex_unlock(&msg_ctx->mux);
- return rc;
+
+ return 0;
}
/**
--
1.7.7.6
@@ -0,0 +1,45 @@
From 590d0b9de4bb4ef7a84bb0a8a13d85353556e7ae Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Tue, 12 Jun 2012 11:17:01 -0700
Subject: [PATCH 098/109] eCryptfs: Properly check for O_RDONLY flag before
doing privileged open
commit 9fe79d7600497ed8a95c3981cbe5b73ab98222f0 upstream.
If the first attempt at opening the lower file read/write fails,
eCryptfs will retry using a privileged kthread. However, the privileged
retry should not happen if the lower file's inode is read-only because a
read/write open will still be unsuccessful.
The check for determining if the open should be retried was intended to
be based on the access mode of the lower file's open flags being
O_RDONLY, but the check was incorrectly performed. This would cause the
open to be retried by the privileged kthread, resulting in a second
failed open of the lower file. This patch corrects the check to
determine if the open request should be handled by the privileged
kthread.
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
fs/ecryptfs/kthread.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index 69f994a..0dbe58a 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -149,7 +149,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
(*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred);
if (!IS_ERR(*lower_file))
goto out;
- if (flags & O_RDONLY) {
+ if ((flags & O_ACCMODE) == O_RDONLY) {
rc = PTR_ERR((*lower_file));
goto out;
}
--
1.7.7.6
@@ -0,0 +1,49 @@
From 074935f3d2b0e862c66af5032619659b070e1ebb Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 29 May 2012 21:21:07 +0200
Subject: [PATCH 099/109] ACPI / PM: Make acpi_pm_device_sleep_state() follow
the specification
commit dbe9a2edd17d843d80faf2b99f20a691c1853418 upstream.
The comparison between the system sleep state being entered
and the lowest system sleep state the given device may wake up
from in acpi_pm_device_sleep_state() is reversed, because the
specification (ACPI 5.0) says that for wakeup to work:
"The sleeping state being entered must be less than or equal to the
power state declared in element 1 of the _PRW object."
In other words, the state returned by _PRW is the deepest
(lowest-power) system sleep state the device is capable of waking up
the system from.
Moreover, acpi_pm_device_sleep_state() also should check if the
wakeup capability is supported through ACPI, because in principle it
may be done via native PCIe PME, for example, in which case _SxW
should not be evaluated.
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
drivers/acpi/sleep.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index ca191ff..ed6bc52 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -702,8 +702,8 @@ int acpi_pm_device_sleep_state(struct device *dev, int *d_min_p)
* can wake the system. _S0W may be valid, too.
*/
if (acpi_target_sleep_state == ACPI_STATE_S0 ||
- (device_may_wakeup(dev) &&
- adev->wakeup.sleep_state <= acpi_target_sleep_state)) {
+ (device_may_wakeup(dev) && adev->wakeup.flags.valid &&
+ adev->wakeup.sleep_state >= acpi_target_sleep_state)) {
acpi_status status;
acpi_method[3] = 'W';
--
1.7.7.6

Some files were not shown because too many files have changed in this diff Show More