[Gluster-users] Afr crashing quite frequently - very unstable

Vikas R vicky.ice at gmail.com
Fri Jan 14 07:12:46 UTC 2011


Hi,

I am unable to run the Gluster client in my setup. AFR with distribute crashes
randomly; it seems like 3.1.0 is very unstable.

Stack:

[2011-01-14 06:57:28.726872] E
[afr-self-heal-algorithm.c:762:sh_diff_checksum_cbk] replicate-1: checksum
on
/streaming/set11/out/multiple_reduce.flash_pl.2.1294988017.1.172.26.98.55.2.gz
failed on subvolume distribute-2 (File descriptor in bad state)
[2011-01-14 06:57:28.726901] E
[afr-self-heal-algorithm.c:762:sh_diff_checksum_cbk] replicate-1: checksum
on
/streaming/set11/out/multiple_reduce.flash_pl.2.1294988017.1.172.26.98.55.2.gz
failed on subvolume distribute-1 (File descriptor in bad state)
[2011-01-14 06:57:28.726913] E
[afr-self-heal-algorithm.c:956:sh_diff_loop_driver] replicate-1: diff
 meta-data data self-heal aborting on
/streaming/set11/out/multiple_reduce.flash_pl.2.1294988017.1.172.26.98.55.2.gz
[2011-01-14 06:57:28.726962] E
[afr-self-heal-algorithm.c:762:sh_diff_checksum_cbk] replicate-1: checksum
on
/streaming/set11/out/multiple_reduce.flash_pl.2.1294988017.1.172.26.98.55.2.gz
failed on subvolume distribute-2 (File descriptor in bad state)
[2011-01-14 06:57:28.726974] E
[afr-self-heal-algorithm.c:762:sh_diff_checksum_cbk] replicate-1: checksum
on
/streaming/set11/out/multiple_reduce.flash_pl.2.1294988017.1.172.26.98.55.2.gz
failed on subvolume distribute-1 (File descriptor in bad state)
[2011-01-14 06:57:28.726984] E
[afr-self-heal-algorithm.c:956:sh_diff_loop_driver] replicate-1: diff
 meta-data data self-heal aborting on
/streaming/set11/out/multiple_reduce.flash_pl.2.1294988017.1.172.26.98.55.2.gz
[2011-01-14 06:57:28.727192] E [mem-pool.c:264:__gf_free]
(-->/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so [0xf613349d]
(-->/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so [0xf613181f]
(-->/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so
[0xf61316a6]))) : Assertion failed: 0
pending frames:
frame : type(1) op(LOOKUP)
frame : type(1) op(LOOKUP)

patchset: v3.1.0
signal received: 11
time of crash: 2011-01-14 06:57:28
configuration details:
argp 1
backtrace 1
dlfcn 1
fdatasync 1
libpthread 1
llistxattr 1
setfsid 1
spinlock 1
epoll.h 1
xattr.h 1
st_atim.tv_nsec 1
package-string: glusterfs 3.1.0
[0xffffe400]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so[0xf61316a6]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so[0xf613181f]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so[0xf613349d]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so[0xf613387d]
/usr/local/akamai/lib/libglusterfs.so.0(default_rchecksum_cbk+0x79)[0xf76f3979]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/protocol/client.so(client3_1_rchecksum+0x1b4)[0xf6181594]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/protocol/client.so(client_rchecksum+0x93)[0xf617c033]
/usr/local/akamai/lib/libglusterfs.so.0(default_rchecksum+0xd9)[0xf76edcb9]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so[0xf6131fbd]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so(afr_sh_algo_diff+0x14d)[0xf613248d]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so(afr_sh_data_sync_prepare+0x11a)[0xf61225fa]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so(afr_sh_data_fix+0x29c)[0xf612298c]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/afr.so(afr_sh_data_fstat_cbk+0xf8)[0xf6122ce8]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/cluster/dht.so(dht_attr_cbk+0xff)[0xf615d7cf]
/usr/local/akamai/lib/glusterfs/3.1.0/xlator/protocol/client.so(client3_1_fstat_cbk+0x331)[0xf6190bb1]
/usr/local/akamai/lib/libgfrpc.so.0(rpc_clnt_handle_reply+0xc2)[0xf76cdc42]
/usr/local/akamai/lib/libgfrpc.so.0(rpc_clnt_notify+0xa2)[0xf76cde62]
/usr/local/akamai/lib/libgfrpc.so.0(rpc_transport_notify+0x35)[0xf76c84c5]
/usr/local/akamai/lib/glusterfs/3.1.0/rpc-transport/socket.so(socket_event_poll_in+0x50)[0xf5e9f500]
/usr/local/akamai/lib/glusterfs/3.1.0/rpc-transport/socket.so(socket_event_handler+0x15b)[0xf5e9f67b]
/usr/local/akamai/lib/libglusterfs.so.0[0xf7708cff]
/usr/local/akamai/lib/libglusterfs.so.0(event_dispatch+0x21)[0xf7707a21]
glusterfsc(main+0x48c)[0x804c45c]
/lib/tls/i686/cmov/libc.so.6(__libc_start_main+0xdc)[0xf756e18c]
glusterfsc[0x804a631]

Attaching config files.

Thanks,
Vikas
-------------- next part --------------
## file auto generated by /usr/local/bin/glusterfs-volgen (export.vol)
# Cmd line:
# $ /usr/local/bin/glusterfs-volgen --name gfs 172.24.0.68:/ghostcache/home/hsawhney/gfs/ 172.24.0.222:/ghostcache/home/hsawhney/gfs/

volume posix1
  type storage/posix
  option directory /ghostcache/gfs-export/
end-volume

volume locks1
    type features/locks
    subvolumes posix1
end-volume

#volume quota1
#    type features/quota
#    #option disk-usage-limit 100MB
#    subvolumes locks1
#end-volume

volume brickex
    type performance/io-threads
    option thread-count 4
    subvolumes locks1
end-volume

volume server-tcp
    type protocol/server
    option transport-type tcp
    option auth.addr.brickex.allow *
    option transport.socket.listen-port 6996
    option transport.socket.nodelay on
    subvolumes brickex
end-volume
-------------- next part --------------
# file auto generated by /usr/local/bin/glusterfs-volgen (mount.vol)
# Cmd line:
# $ /usr/local/bin/glusterfs-volgen --name gfs 172.24.0.68:/ghostcache/home/hsawhney/gfs/ 172.24.0.222:/ghostcache/home/hsawhney/gfs/

# TRANSPORT-TYPE tcp
volume 172.26.98.55-1
    type protocol/client
    option transport-type tcp
    option remote-host 172.26.98.55
    option transport.socket.nodelay on
    option transport.remote-port 6996
    option remote-subvolume brickex
end-volume

volume 172.26.98.56-1
    type protocol/client
    option transport-type tcp
    option remote-host 172.26.98.56
    option transport.socket.nodelay on
    option transport.remote-port 6996
    option remote-subvolume brickex
end-volume

volume 172.26.98.57-1
    type protocol/client
    option transport-type tcp
    option remote-host 172.26.98.57
    option transport.socket.nodelay on
    option transport.remote-port 6996
    option remote-subvolume brickex
end-volume

volume 172.26.98.59-1
    type protocol/client
    option transport-type tcp
    option remote-host 172.26.98.59
    option transport.socket.nodelay on
    option transport.remote-port 6996
    option remote-subvolume brickex
end-volume

#volume 172.26.98.61-1
#    type protocol/client
#    option transport-type tcp
#    option remote-host 172.26.98.61
#    option transport.socket.nodelay on
#    option transport.remote-port 6996
#    option remote-subvolume brickex
#end-volume

#volume 172.26.98.62-1
#    type protocol/client
#    option remote-host 172.26.98.62
#    option transport.socket.nodelay on
#    option transport.remote-port 6996
#    option remote-subvolume brickex
#end-volume

volume distribute-1
    type cluster/dht
    subvolumes 172.26.98.55-1 172.26.98.56-1
end-volume

volume distribute-2
    type cluster/dht
    subvolumes 172.26.98.57-1 172.26.98.59-1
end-volume

#volume distribute-3
#    type cluster/dht
#    subvolumes 172.26.98.61-1 172.26.98.62-1
#end-volume

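volume replicate-1
    type cluster/afr
    # NOTE(review): lookup-unhashed looks like a cluster/dht (distribute) option,
    # not a cluster/afr one -- confirm it is intended on this volume.
    option lookup-unhashed yes
    subvolumes distribute-1 distribute-2
    #subvolumes distribute-1 distribute-2 distribute-3
end-volume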

#volume stripe
#    type cluster/stripe
#    option block-size 1MB
#    subvolumes replicate-1 replicate-2 replicate-3
#end-volume

volume writebehind
    type performance/write-behind
    option cache-size 4MB
    subvolumes replicate-1
end-volume

volume io-cache
  type performance/io-cache
  option cache-size 64MB             # default is 32MB
  #option priority *.h:3,*.html:2,*:1 # default is '*:0'
  option cache-timeout 2             # default is 1 second
  subvolumes writebehind
end-volume

volume stat-prefetch
  type performance/stat-prefetch
  subvolumes io-cache
end-volume







More information about the Gluster-users mailing list