[Gluster-devel] GlusterFS crash on node reboot
Insomniac
dcs at digitalcybersoft.com
Wed Jul 18 21:55:38 UTC 2007
My situation is that I have two nodes, each running both the client and the server. If I
reboot one of the nodes and then attempt a stat (or any other operation)
on the mounted volume, glusterfsd locks up for a bit and then crashes.
The debug output is as follows:
2007-07-18 16:15:35 C [common-utils.c:208:gf_print_trace] debug-backtrace: Got signal (11), printing backtrace
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/libglusterfs.so.0(gf_print_trace+0x2d) [0x2a890d]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: [0x18f420]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/libglusterfs.so.0(dict_ref+0x29) [0x2a1f89]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/glusterfs/1.3.0-pre5.4/xlator/protocol/server.so [0xb04be8]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/glusterfs/1.3.0-pre5.4/xlator/protocol/server.so [0xb06ac2]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/glusterfs/1.3.0-pre5.4/xlator/performance/io-threads.so [0x8b4ae5]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/glusterfs/1.3.0-pre5.4/xlator/cluster/unify.so [0xbab62c]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/glusterfs/1.3.0-pre5.4/xlator/cluster/afr.so [0x4524c2]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/glusterfs/1.3.0-pre5.4/xlator/protocol/client.so [0x64a215]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/glusterfs/1.3.0-pre5.4/xlator/protocol/client.so(notify+0x446) [0x647ef6]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/libglusterfs.so.0(transport_notify+0x37) [0x2a9e17]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/libglusterfs.so.0(sys_epoll_iteration+0xd9) [0x2aa939]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /usr/local/lib/libglusterfs.so.0(poll_iteration+0x1d) [0x2a9eed]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: [glusterfsd] [0x8049250]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: /lib/libc.so.6(__libc_start_main+0xe0) [0x4748df70]
2007-07-18 16:15:35 C [common-utils.c:210:gf_print_trace] debug-backtrace: [glusterfsd] [0x8048c31]
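The frame just below the signal handler is dict_ref, reached from the protocol/server translator while a notify event from protocol/client (i.e. the dropped connection to the rebooted peer) was being propagated up through afr, unify and io-threads. My guess, and it is only a guess, is that a NULL dict is being handed up the reply path on disconnect. The fragment below is a minimal, self-contained C sketch of that failure mode and the kind of guard that would avoid it; the struct and functions here are illustrative stand-ins, not the actual GlusterFS source.

#include <stdio.h>

/* Illustrative stand-in for GlusterFS's dict_t; the real layout differs. */
typedef struct _dict {
        int refcount;
} dict_t;

/* Unchecked increment, comparable to a bare ref-count bump: passing NULL
 * (e.g. a reply dict built after the peer vanished) segfaults here. */
static dict_t *dict_ref_unchecked (dict_t *this)
{
        this->refcount++;       /* SIGSEGV when this == NULL */
        return this;
}

/* Defensive variant: refuse a NULL dict instead of crashing. */
static dict_t *dict_ref_guarded (dict_t *this)
{
        if (this == NULL) {
                fprintf (stderr, "dict_ref: NULL dict, dropping reference\n");
                return NULL;
        }
        this->refcount++;
        return this;
}

int main (void)
{
        dict_t *reply = NULL;   /* what the disconnect path may be producing */

        dict_ref_guarded (reply);       /* logs and survives */
        dict_ref_unchecked (reply);     /* crashes with signal 11 */
        return 0;
}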
My server config is as follows:
volume domainfs-ds
type storage/posix # POSIX FS translator
option directory /export/domainfs # Export this directory
end-volume
volume domainfs-ns
type storage/posix # POSIX FS translator
option directory /home/domainfs-ns # Export this directory
end-volume
volume domainfs-loadb1-ds
type protocol/client
option transport-type tcp/client
option remote-host 192.168.0.2
option remote-subvolume domainfs-ds
end-volume
volume domainfs-loadb1-ns
type protocol/client
option transport-type tcp/client
option remote-host 192.168.0.2
option remote-subvolume domainfs-ns
end-volume
volume domainfs-loadb2-ds
type protocol/client
option transport-type tcp/client
option remote-host 192.168.0.3
option remote-subvolume domainfs-ds
end-volume
volume domainfs-loadb2-ns
type protocol/client
option transport-type tcp/client
option remote-host 192.168.0.3
option remote-subvolume domainfs-ns
end-volume
volume domainfs-ns-afr
type cluster/afr
# There appears to be a bug with AFR and Local Posix Volumes.
# To get around this we pretend the local volume is remote with an extra client volume named domainfs-santa1-ns.
# subvolumes domainfs-ns domainfs-santa2-ns domainfs-santa3-ns
subvolumes domainfs-loadb1-ns domainfs-loadb2-ns
option replicate *:2
end-volume
volume domainfs-ds-afr
type cluster/afr
# There appears to be a bug with AFR and Local Posix Volumes.
# To get around this we pretend the local volume is remote with an extra client volume named domainfs-santa1-ds.
subvolumes domainfs-loadb1-ds domainfs-loadb2-ds
option replicate *:2
end-volume
volume domainfs-unify
type cluster/unify
subvolumes domainfs-ds-afr
option namespace domainfs-ns-afr
option scheduler rr
end-volume
volume domainfs
type performance/io-threads
option thread-count 8
option cache-size 64MB
subvolumes domainfs-unify
end-volume
### Add network serving capability to above brick.
volume server
type protocol/server
option transport-type tcp/server # For TCP/IP transport
subvolumes domainfs
option auth.ip.domainfs-ds.allow 192.168.0.*,127.0.0.1 # Allow access to "brick" volume
option auth.ip.domainfs-ns.allow 192.168.0.*,127.0.0.1 # Allow access to "brick" volume
option auth.ip.domainfs.allow * # Allow access to "brick" volume
end-volume
My client config:
### Add client feature and attach to remote subvolume
volume loadb
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host 192.168.0.3 # IP address of the remote brick (points to the local host on both boxes)
option remote-subvolume domainfs # name of the remote volume
end-volume
# volume loadb2
# type protocol/client
# option transport-type tcp/client # for TCP/IP transport
# option remote-host 192.168.0.3 # IP address of the remote brick
# option remote-subvolume domainfs # name of the remote volume
# end-volume
### Add writeback feature
volume writeback
type performance/write-behind
option aggregate-size 131072 # unit in bytes
subvolumes loadb
end-volume
### Add readahead feature
volume readahead
type performance/read-ahead
option page-size 65536 # unit in bytes
option page-count 16 # cache per file = (page-count x page-size)
subvolumes writeback
end-volume
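For completeness, the operation that triggers the crash is just an ordinary stat against the mount point. A trivial reproducer follows; the /mnt/domainfs path is only a placeholder for wherever the client spec is mounted.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>

int main (int argc, char *argv[])
{
        /* Path of the GlusterFS mount; placeholder, pass your own as argv[1]. */
        const char *path = (argc > 1) ? argv[1] : "/mnt/domainfs";
        struct stat st;

        /* With the other node rebooted, this stat() hangs for a while and
         * then fails once glusterfsd on this node has crashed. */
        if (stat (path, &st) == -1) {
                fprintf (stderr, "stat(%s) failed: %s\n", path, strerror (errno));
                return 1;
        }

        printf ("stat(%s) ok, size=%lld\n", path, (long long) st.st_size);
        return 0;
}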