re: (FOUND - Need patch) bad bad unknown brokeness

From: david (david@killerlabs.com)
Date: Mon Mar 20 2000 - 22:25:39 EST

  • Next message: Gregory Hosler: "diff switches"

    ok, i sent another email, here's the output, i managed to get a ps axwl run.
    note, it does NOT always hang under the same circumstances, but sending
    large bodied transactions has a high chance of triggering it.

    (ps outputs trimmed)

    James:~# ps axwl
      F UID PID PPID PRI NI VSZ RSS WCHAN STAT TTY TIME COMMAND
    000 0 120 1 8 0 1032 308 wait_o D ? 0:00 /usr/local/sbin/watchdogd
    000 501 716 198 16 0 5036 2432 nanosl S pts/1 0:02 pine
    000 501 4612 716 17 0 1472 784 wait_o D pts/1 0:00 joe /tmp/pico.000716

    the child proc of pine is joe, I was exiting joe which would flush the
    buffer to /tmp/pico.000716 then return to pine, pine would read that file.

    ok, I repeated the exercise to gain more information. this time joe
    managed to exit and pine started the smtp session and hung at the end. here
    is the ps axwl:

    nifty:/usr/src/linux-99p3# grep "wait_o" System.map
    c01232ec T ___wait_on_page
    c012e1cc T __wait_on_buffer
    c0131b8c T __wait_on_super
    c013e394 t __wait_on_inode
    c0152d9c t __wait_on_dquot
    c020b25f ? __kstrtab___wait_on_buffer
    c020b27a ? __kstrtab____wait_on_page
    c020c711 ? __kstrtab___wait_on_super
    c0210b80 ? __ksymtab___wait_on_buffer
    c0210b88 ? __ksymtab____wait_on_page
    c02111c8 ? __ksymtab___wait_on_super

    ok, let's try this exercise without quotas enabled.

    disabling quotas doesn't affect it; the processes still hang:

    000 0 120 1 0 0 1032 308 12e345 D ? 0:00 /usr/local/sbin/watchdogd
    140 516 408 383 0 0 4292 2512 12e345 D ? 0:00 /usr/local/apache/bin/httpd
    040 8 416 414 12 0 2348 1576 15b7e5 D ? 0:15 sendmail: SAA00416 IDENT:mail@mail.l
    000 0 425 351 10 0 1376 656 12e345 D pts/2 0:00 wget -r www.kalifornia.com

    c012e1cc T __wait_on_buffer
    c012e3a4 t sync_buffers

    (unrelated i believe)
    c015b5a8 t __get_request_wait
    c015b848 T is_read_only

    Dump of assembler code for function __wait_on_buffer:
    [snipped to the point]
    0xc012e314 <__wait_on_buffer+328>: movl (%ebx),%ebx
    0xc012e316 <__wait_on_buffer+330>: lock addl $0x0,0x0(%esp,1)
    0xc012e31c <__wait_on_buffer+336>: movl $0x0,0x4(%eax)
    0xc012e323 <__wait_on_buffer+343>: testl %edx,%edx
    0xc012e325 <__wait_on_buffer+345>:
        je 0xc012e308 <__wait_on_buffer+316>
    0xc012e327 <__wait_on_buffer+347>: pushl %ecx
    0xc012e328 <__wait_on_buffer+348>: call *%edx
    0xc012e32a <__wait_on_buffer+350>: addl $0x4,%esp
    0xc012e32d <__wait_on_buffer+353>:
        jmp 0xc012e308 <__wait_on_buffer+316>
    0xc012e32f <__wait_on_buffer+355>: nop
    0xc012e330 <__wait_on_buffer+356>: movl 0x10(%esp,1),%edx
    0xc012e334 <__wait_on_buffer+360>: movl $0x2,(%edx)
    0xc012e33a <__wait_on_buffer+366>: testb $0x4,0x18(%ebp)
    0xc012e33e <__wait_on_buffer+370>:
        je 0xc012e34c <__wait_on_buffer+384>
    0xc012e340 <__wait_on_buffer+372>: call 0xc011450c <schedule>
    0xc012e345 <__wait_on_buffer+377>: testb $0x4,0x18(%ebp)
    0xc012e349 <__wait_on_buffer+381>:
        jne 0xc012e2e8 <__wait_on_buffer+284>
    0xc012e34b <__wait_on_buffer+383>: nop
    0xc012e34c <__wait_on_buffer+384>: movl 0x10(%esp,1),%ecx
    0xc012e350 <__wait_on_buffer+388>: movl $0x0,(%ecx)
    0xc012e356 <__wait_on_buffer+394>: pushf
    0xc012e357 <__wait_on_buffer+395>: popl %ebx
    0xc012e358 <__wait_on_buffer+396>: cli
    0xc012e359 <__wait_on_buffer+397>: movl 0x3c(%esp,1),%eax
    0xc012e35d <__wait_on_buffer+401>: cmpl %edi,%eax
    0xc012e35f <__wait_on_buffer+403>:
        je 0xc012e388 <__wait_on_buffer+444>
    0xc012e361 <__wait_on_buffer+405>: pushl %edi
    0xc012e362 <__wait_on_buffer+406>: pushl %eax
    0xc012e363 <__wait_on_buffer+407>: pushl $0xc01e2920
    0xc012e368 <__wait_on_buffer+412>: call 0xc0117170 <printk>

    linux/fs/buffer.c:134

      134 /*
      135 * Rewrote the wait-routines to use the "new" wait-queue functionality,
      136 * and getting rid of the cli-sti pairs. The wait-queue routines still
      137 * need cli-sti, but now it's just a couple of 386 instructions or so.
      138 *
      139 * Note that the real wait_on_buffer() is an inline function that checks
      140 * if 'b_wait' is set before calling this, so that the queues aren't set
      141 * up unnecessarily.
      142 */
      143 void __wait_on_buffer(struct buffer_head * bh)
      144 {
      145 struct task_struct *tsk = current;
      146 DECLARE_WAITQUEUE(wait, tsk);
      147
      148 atomic_inc(&bh->b_count);
      149 add_wait_queue(&bh->b_wait, &wait);
      150 do {
      151 run_task_queue(&tq_disk);
      152 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
      153 if (!buffer_locked(bh))
      154 break;
      155 schedule();
      156 } while (buffer_locked(bh));
      157 tsk->state = TASK_RUNNING;
      158 remove_wait_queue(&bh->b_wait, &wait);
      159 atomic_dec(&bh->b_count);
      160 }

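    It looks like it's hanging in the loop at lines 155-156: the WCHAN value
    12e345 is the return address of the schedule() call (__wait_on_buffer+377),
    so the task is asleep in schedule() and would next re-test
    buffer_locked(bh) on line 156.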

    include/linux/fs.h:#define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0)
    linux/include/linux/fs.h:#define buffer_locked(bh) __buffer_state(bh,Lock)

    I'm building a kernel now with a printk to show the b_state

    -d

    -
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.rutgers.edu
    Please read the FAQ at http://www.tux.org/lkml/



    This archive was generated by hypermail 2b29 : Tue Mar 21 2000 - 00:00:59 EST