[patch] serial console vs NMI watchdog

From: Andrew Morton (andrewm@uow.edu.au)
Date: Fri Mar 09 2001 - 09:21:25 EST

  • Next message: Hicks, Jamey: "RE: [linux-usb-devel] Re: SLAB vs. pci_alloc_xxx in usb-uhci patch"

    SYSRQ-T on a serial console can crash the machine. A large
    amount of output is sent to a slow device while interrupts
    are disabled, so the NMI watchdog sees no timer interrupts
    for several seconds and triggers.

    The interrupt disabling happens in pc_keyb.c:keyboard_interrupt().
    Changing this code to *not* disable interrupts looks complex.

    I see two ways of fixing this. One is to do the sysrq
    stuff outside the spin_lock_irq(), with:

    static void keyboard_interrupt(int irq, void *dev_id, struct pt_regs *regs)
    {
    + extern void (*sysrq_handler)(void);
    + void (*my_sysrq_handler)(void);

            spin_lock_irq(&kbd_controller_lock);
            handle_kbd_event();
    + my_sysrq_handler = sysrq_handler;
    + sysrq_handler = 0;
            spin_unlock_irq(&kbd_controller_lock);
    + if (my_sysrq_handler)
    + (*my_sysrq_handler)();
    }

    But I didn't do that, because I suspect there are other
    places in the kernel (development and debug stuff) where
    we want to turn the NMI watchdog handler off for a while.

    So this patch creates a new API function

            enable_nmi_watchdog(int yes);

    and uses it within the sysrq code.

    BTW: NMI watchdog is now disabled by default in 2.4.3-pre3.
    The `nmi_watchdog=1' boot option is needed to enable it.

    --- linux-2.4.2-ac16/include/linux/irq.h Fri Mar 9 17:11:17 2001
    +++ linux-ac/include/linux/irq.h Sat Mar 10 01:02:12 2001
    @@ -56,6 +56,20 @@
     
     #include <asm/hw_irq.h> /* the arch dependent stuff */
     
    +/**
    + * enable_nmi_watchdog - enables/disables NMI watchdog checking.
    + * @yes: If zero, disable the watchdog; if nonzero, re-enable it
    + *
    + * If the architecture supports the NMI watchdog, enable_nmi_watchdog() may be used
    + * to temporarily disable it. Calls to enable_nmi_watchdog() may be nested - it is
    + * implemented as an up/down counter, so the calls must be balanced.
    + */
    +#ifdef ARCH_HAS_NMI_WATCHDOG
    +extern void enable_nmi_watchdog(int yes);
    +#else
    +#define enable_nmi_watchdog(yes) do{} while(0)
    +#endif
    +
     extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
     extern int setup_irq(unsigned int , struct irqaction * );
     
    --- linux-2.4.2-ac16/include/asm-i386/irq.h Fri Oct 8 03:17:09 1999
    +++ linux-ac/include/asm-i386/irq.h Fri Mar 9 22:59:15 2001
    @@ -32,4 +32,8 @@
     extern void disable_irq_nosync(unsigned int);
     extern void enable_irq(unsigned int);
     
    +#ifdef CONFIG_X86_LOCAL_APIC
    +#define ARCH_HAS_NMI_WATCHDOG /* See include/linux/irq.h */
    +#endif
    +
     #endif /* _ASM_IRQ_H */
    --- linux-2.4.2-ac16/drivers/char/sysrq.c Sun Feb 25 17:37:04 2001
    +++ linux-ac/drivers/char/sysrq.c Fri Mar 9 23:00:39 2001
    @@ -23,6 +23,7 @@
     #include <linux/quotaops.h>
     #include <linux/smp_lock.h>
     #include <linux/module.h>
    +#include <linux/irq.h>
     
     #include <asm/ptrace.h>
     
    @@ -69,6 +70,11 @@
             if (!key)
                     return;
     
    + /*
    + * Interrupts are disabled, and serial consoles are slow, so
    + * let's suspend the NMI watchdog.
    + */
    + enable_nmi_watchdog(0);
             console_loglevel = 7;
             printk(KERN_INFO "SysRq: ");
             switch (key) {
    @@ -152,6 +158,7 @@
                     /* Don't use 'A' as it's handled specially on the Sparc */
             }
     
    + enable_nmi_watchdog(1);
             console_loglevel = orig_log_level;
     }
     
    --- linux-2.4.2-ac16/arch/i386/kernel/nmi.c Fri Mar 9 17:10:51 2001
    +++ linux-ac/arch/i386/kernel/nmi.c Sat Mar 10 01:10:50 2001
    @@ -226,6 +226,15 @@
     }
     
     static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
    +static atomic_t nmi_watchdog_enabled = ATOMIC_INIT(0); /* 0 == enabled */
    +
    +void enable_nmi_watchdog(int yes)
    +{
    + if (yes)
    + atomic_inc(&nmi_watchdog_enabled);
    + else
    + atomic_dec(&nmi_watchdog_enabled);
    +}
     
     void nmi_watchdog_tick (struct pt_regs * regs)
     {
    @@ -255,7 +264,7 @@
     
             sum = apic_timer_irqs[cpu];
     
    - if (last_irq_sums[cpu] == sum) {
    + if (last_irq_sums[cpu] == sum && atomic_read(&nmi_watchdog_enabled) == 0) {
                     /*
                      * Ayiee, looks like this CPU is stuck ...
                      * wait a few IRQs (5 seconds) before doing the oops ...
    -
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/



    This archive was generated by hypermail 2b29 : Fri Mar 09 2001 - 09:34:19 EST