[Gluster-devel] quota.t hangs on NetBSD machines

Raghavendra Talur rtalur at redhat.com
Thu Dec 31 10:10:54 UTC 2015


On Thu, Dec 31, 2015 at 3:24 PM, Emmanuel Dreyfus <manu at netbsd.org> wrote:

> On Thu, Dec 31, 2015 at 02:51:41PM +0530, Raghavendra Talur wrote:
> > To our surprise though, the hung test started proceeding.
>
> You mean a process gets stuck in a system call for hours and then
> is able to escape?
>
> Some hints:
>
> 1) ps -axl shows the waiting channel (WCHAN column) for a process stuck
> in kernel. What is it?
>

# ps -axl | grep 23268
   0 23268     1     0  85  0  56436 12544 select   Isl  ?        0:02.06
glusterfs --attribute-timeout=0 --entry-timeout=0 -s nbsla
# ps -axl | grep 26515
   0 26515     1     0  85  0   4508  1440 kqueue   S+   pts/0    0:00.04
perfused: perfused /mnt/glusterfs/0 (perfused)



>
> 2) crash is a kernel debugger that can be used while running multiuser.
> Of course since the system is running, the output is obsolete most
> of the time, but for a stuck process we can extract valuable information.
>
> Run crash from the shell, then inside crash, run the ps command. Find the
> relevant process and note the address in the STRUCT LWP * column. For
> example, let us say it is c63452a0.
>

relevant lines:

23268    8 3   0        80           c4c9f000          glusterfsd parked
23268    7 3   0        80           c5223a80         glusterfsd netio
23268    6 3   0        80           c542e560         glusterfsd nanoslp
23268    5 3   1        80           c5229a80         glusterfsd parked
23268    4 3   0        80           c5418d40         glusterfsd parked
23268    3 3   0        80           c5346540         glusterfsd sigwait
23268    2 3   0        80           c4ce22c0         glusterfsd nanoslp
23268    1 3   1        80           c5418020         glusterfsd select
26515    1 3   1        80           c53692c0         perfused kqueue



>
> bt/a c63452a0 will produce a kernel backtrace for the process. This can
> be extremely valuable to understand what is going on. If we are waiting
> for a lock, we can track what process is holding it.
>

bt/a c4c9f000
trace: pid 23268 lid 8 at 0xdc171e9c
sleepq_block(0,1,c047f728,c049a1bc,6,5000a018,8,dc171f54,c4c9f000,6) at
sleepq_block+0x9b
lwp_park(0,1,0,bb1ac150,1de,dc171f54,6,dc171f40,c4c9f000,c0494528) at
lwp_park+0x115
sys____lwp_park60(c4c9f000,dc171f54,dc171f7c,dc171fa0,c02eabb7,dc171f54,1de,103,0,1)
at sys____lwp_park60+0x50
syscall() at syscall+0x89
--- syscall (number 478) ---
bb3bb4b7:

bt/a c5223a80
trace: pid 23268 lid 7 at 0xdb781d0c
sleepq_block(0,1,c0482887,c0495450,c5223a80,6473,c2d42d40,c2d41dc0,c2d29f82,0)
at sleepq_block+0x9b
cv_timedwait_sig(c4dabf2c,c386c3c0,0,c040eb9f,c50a1001,c4eb2f47,65686ee0,c4dabe44,c4dabe44,0)
at cv_timedwait_sig+0xaa
sbwait(c4dabf00,0,db781dbc,c01215cc,0,c50a1080,db781dcc,140eb9f,0,c049a760)
at sbwait+0x57
soreceive(c4dabe44,0,db781ec8,0,0,0,db781e5c,c040eb9f,2,2) at
soreceive+0xc59
soo_read(c4c932c0,c4c932c0,db781ec8,c4c79000,1,c2d29f80,db781e8c,c02552a5,0,0)
at soo_read+0x3c
do_filereadv(a,b88fff8c,2,c4c932c0,1,db781f7c,3,c5223a80,c5223a80,c5223a80)
at do_filereadv+0x1f0
sys_readv(c5223a80,db781f54,db781f7c,db781fa0,c02eabb7,db781f54,78,db781f7c,a,b88fff8c)
at sys_readv+0x38
syscall() at syscall+0x18b
--- syscall (number 120) ---
bb351877:

bt/a c542e560
trace: pid 23268 lid 6 at 0xdc5edddc
sleepq_block(1f5,1,c047363e,c0495dc8,0,3ffff,0,c2d41440,c049cb00,1f5) at
sleepq_block+0xea
kpause(c047363e,1,1f5,0,dc5edea4,c4e5df80,ffffffff,ffffffff,c4da9c00,c3458bb0)
at kpause+0xe8
nanosleep1(c542e560,3,0,dc5edefc,dc5edf08,9,c38b7360,0,c542e560,c0492efc)
at nanosleep1+0xe5
sys___nanosleep50(c542e560,dc5edf54,dc5edf7c,c08eb880,c048ce80,dc5edf54,1ae,b9a0e7c0,b8fff730,b8fff73c)
at sys___nanosleep50+0x5f
syscall() at syscall+0x89
--- syscall (number 430) ---
bb351957:


bt/a c5229a80
trace: pid 23268 lid 5 at 0xdba17e9c
sleepq_block(ea60,1,c047f728,c049a1bc,0,64,dba17efc,c0371aeb,0,c5418d40) at
sleepq_block+0xea
lwp_park(0,1,dba17f18,ba40d1a4,0,257,0,3acd705f,c5229a80,c0494528) at
lwp_park+0x115
sys____lwp_park60(c5229a80,dba17f54,dba17f7c,dba17fa0,c02eab83,dba17f54,1de,103,0,1)
at sys____lwp_park60+0x50
syscall() at syscall+0x89
--- syscall (number 478) ---
bb3bb4b7:


 bt/a  c5418d40
trace: pid 23268 lid 4 at 0xdb691e9c
sleepq_block(ea60,1,c047f728,c049a1bc,0,db691fa8,db691eec,c0251425,db691ed4,6)
at sleepq_block+0xea
lwp_park(0,1,db691f18,ba40d1a4,0,257,0,3accfb61,c5418d40,c0494528) at
lwp_park+0x115
sys____lwp_park60(c5418d40,db691f54,db691f7c,db691fa0,c02eab83,db691f54,1de,103,0,1)
at sys____lwp_park60+0x50
syscall() at syscall+0x89
--- syscall (number 478) ---
bb3bb4b7:

 bt/a c5346540
trace: pid 23268 lid 3 at 0xdb727e1c
sleepq_block(0,1,c04716ba,c0495450,c040bc16,0,c2d42d40,c2d41400,c5346540,0)
at sleepq_block+0x9b
cv_timedwait_sig(c53466b4,c5004b80,0,c53466a4,3,db727e90,c53466a4,c41eb528,db727eac,7ff0)
at cv_timedwait_sig+0xaa
sigtimedwait1(c5346540,db727f54,db727f7c,c01026f0,c01026a0,c01026f0,c01026a0,c06d9000,c02ea954,c3469b10)
at sigtimedwait1+0x242
sys_____sigtimedwait50(c5346540,db727f54,db727f7c,db727fa0,c02eab83,db727f54,1af,103,ba1fefbc,0)
at sys_____sigtimedwait50+0x3f
syscall() at syscall+0x89
--- syscall (number 431) ---
bb39f8c7:



bt/a c4ce22c0
trace: pid 23268 lid 2 at 0xdcccdddc
sleepq_block(65,1,c047363e,c0495dc8,0,c4eb2f47,e1f297c1,c2d40540,c049c7dc,65)
at sleepq_block+0xea
kpause(c047363e,1,65,0,dcccdea4,dcccdec4,dcccdeac,c0251276,c048cce0,1) at
kpause+0xe8
nanosleep1(c4ce22c0,3,0,dcccdefc,0,c2d1f548,fffffffd,a5b55001,1b,dcccdfa8)
at nanosleep1+0xe5
sys___nanosleep50(c4ce22c0,dcccdf54,dcccdf7c,dcccdfa0,c02eab83,dcccdf54,1ae,103,ba3fff98,0)
at sys___nanosleep50+0x5f
syscall() at syscall+0x89
--- syscall (number 430) ---
bb351957:



 bt/a c5418020
trace: pid 23268 lid 1 at 0xdb721d0c
sleepq_block(2,1,c047fc22,c049a1f8,c56aae94,1,ffffffff,c040eb25,c048ce80,0)
at sleepq_block+0xea
sel_do_scan(30,db721f18,0,db721f7c,c2d42bc2,c4eb2f47,d5d7bdcf,c02369f7,3,3)
at sel_do_scan+0x46e
pollcommon(db721f7c,ba4143c0,6,db721f18,0,db721fa8,db721f2c,db721f40,c040eb9f,0)
at pollcommon+0xe7
sys_poll(c5418020,db721f54,db721f7c,db721fa0,c02eabb7,db721f54,d1,103,ba4143c0,6)
at sys_poll+0x6a
syscall() at syscall+0x89
--- syscall (number 209) ---
bb351917:

bt/a c53692c0
trace: pid 26515 lid 1 at 0xdbce3d2c
sleepq_block(0,1,c0471305,c0495450,c5066f80,1,c2d42c00,c2d42d80,1,0) at
sleepq_block+0x9b
cv_timedwait_sig(c38eff3c,c38eff10,0,c012f509,c048ce80,0,dbce3dbc,c040eb9f,c31a3560,c048ce80)
at cv_timedwait_sig+0xaa
kevent1(dbce3f7c,d,bb51f080,0,bb51f080,4,0,c044fbb0,c0492ef0,dbce3fa8) at
kevent1+0x45a
sys___kevent50(c53692c0,dbce3f54,dbce3f7c,dbce3fa0,c02eabb7,dbce3f54,1b3,103,d,bb51f080)
at sys___kevent50+0x45
syscall() at syscall+0x89
--- syscall (number 435) ---
bb679a77:




 Thanks for the help!


>
> --
> Emmanuel Dreyfus
> manu at netbsd.org
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.gluster.org/pipermail/gluster-devel/attachments/20151231/a0b07a09/attachment-0001.html>


More information about the Gluster-devel mailing list