[Bugs] [Bug 1263585] Data Tiering: new crash seen with tier rebalance daemon

Wed Sep 16 09:16:37 UTC 2015

https://bugzilla.redhat.com/show_bug.cgi?id=1263585

Nithya Balachandran <nbalacha at redhat.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |nbalacha at redhat.com

--- Comment #2 from Nithya Balachandran <nbalacha at redhat.com> ---
Analysis:

The bricklists have an extra entry at the end. I have verified this on a
running tier process for a newly created volume as well.

Core was generated by `/usr/sbin/glusterfs -s localhost --volfile-id
rebalance/gold --xlator-option *d'.
Program terminated with signal 11, Segmentation fault.
#0  0x00007ff8f9f82af0 in syncop_ipc (subvol=0x7ff7a2de5700, op=op at entry=1,
xdata_in=xdata_in at entry=0x0, 
    xdata_out=xdata_out at entry=0x0) at syncop.c:2823
2823            SYNCOP (subvol, (&args), syncop_ipc_cbk, subvol->fops->ipc,

(gdb) bt
#0  0x00007ff8f9f82af0 in syncop_ipc (subvol=0x7ff7a2de5700, op=op at entry=1,
xdata_in=xdata_in at entry=0x0, 
    xdata_out=xdata_out at entry=0x0) at syncop.c:2823
#1  0x00007ff8e7919fb9 in tier_process_brick_cbk (args=<synthetic pointer>,
local_brick=0x7ff8db43bc60) at tier.c:548
#2  tier_build_migration_qfile (is_promotion=_gf_false,
query_cbk_args=0x7ff79fddee70, args=0x7ff8db43bcc0) at tier.c:608
#3  tier_demote (args=0x7ff8db43bcc0) at tier.c:668
#4  0x00007ff8f8d99df5 in start_thread () from /lib64/libpthread.so.0
#5  0x00007ff8f86e01ad in clone () from /lib64/libc.so.6

(gdb) p *subvol
$1 = {name = 0x7ff7a2de5700 "", type = 0x7ff8cc037460 "\002", next =
0x7ff7a2de5700, prev = 0x1, parents = 0x0, 
  children = 0x5fd8b3b6b4645700, options = 0x86286dc0e2942214, dlhandle = 0x0,
fops = 0x0, cbks = 0x0, dumpops = 0x0, 
  volume_options = {next = 0x0, prev = 0x0}, fini = 0x0, init = 0x0,
reconfigure = 0x0, mem_acct_init = 0x0, notify = 0x0, 
  loglevel = GF_LOG_NONE, latencies = {{min = 0, max = 0, total = 0, std = 0,
mean = 0, count = 0} <repeats 11 times>, {min = 0, 
      max = 0, total = 0, std = 6.9515818434295051e-310, mean =
6.9515814287748329e-310, count = 140527034957824}, {
      min = 140701566130656, max = 140701566130656, total =
-nan(0xfffffffffffe0), std = 0, mean = 0, count = 140701566127856}, {
      min = 16, max = 1, total = 0, std = 0, mean = 0, count = 1}, {min = 0,
max = 0, total = 0, std = 0, mean = 0, count = 0}, {
      min = 0, max = 0, total = 0, std = 0, mean = 0, count = 0}, {min = 0, max
= 0, total = 0, std = 0, mean = 0, count = 0}, {
      min = 0, max = 0, total = 0, std = 0, mean = 0, count = 0}, {min = 0, max
= 0, total = 0, std = 0, mean = 0, count = 0}, {
      min = 0, max = 0, total = 0, std = 0, mean = 0, count = 0}, {min = 0, max
= 0, total = 0, std = 0, mean = 0, count = 0}, {
      min = 0, max = 0, total = 0, std = 0, mean = 0, count = 0}, {min = 0, max
= 0, total = 0, std = 0, mean = 0, count = 0}, {
      min = 0, max = 0, total = 0, std = 0, mean = 0, count = 140701566130704},
{min = 0, max = 0, total = 0, std = 0, mean = 0, 
      count = 0}, {min = 0, max = 0, total = 0, std = 0, mean = 0, count = 0},
{min = 0, max = 0, total = 0, std = 0, mean = 0, 
      count = 0}, {min = 0, max = 0, total = 0, std = 0, mean = 0, count = 0},
{min = 0, max = 0, total = 0, std = 0, mean = 0, 
      count = 0}, {min = 0, max = 0, total = 0, std = 1.4308692302641265e-308,
mean = 0, count = 0}, {min = 0, 
      max = 140707013696768, total = 6.9518399606381148e-310, std = 0, mean =
0, count = 0}, {min = 0, max = 0, total = 0, 
      std = 0, mean = 0, count = 140701557739520}, {min = 8392704, max = 4096,
total = 2.0236928853657458e-320, std = 0, 
      mean = 0, count = 0}, {min = 0, max = 0, total = 0, std = 0, mean = 0,
count = 0} <repeats 17 times>}, history = 0x0, 
  ctx = 0x0, graph = 0x0, itable = 0x0, init_succeeded = 0 '\000', private =
0x0, mem_acct = 0x0, winds = 0, 
  switched = 0 '\000', local_pool = 0x0, is_autoloaded = _gf_false}
(gdb) 

The subvol is not a valid xlator. Dereferencing this causes the crash.

(gdb) f 2
#2  tier_build_migration_qfile (is_promotion=_gf_false,
query_cbk_args=0x7ff79fddee70, args=0x7ff8db43bcc0) at tier.c:608
608                    ret = tier_process_brick_cbk (local_brick,
(gdb) l
603            gfdb_brick_dict_info.time_stamp = &time_in_past;
604            gfdb_brick_dict_info._gfdb_promote = is_promotion;
605            gfdb_brick_dict_info._query_cbk_args = query_cbk_args;
606    
607            list_for_each_entry (local_brick, args->brick_list, list) {
608                    ret = tier_process_brick_cbk (local_brick,
609                                                    &gfdb_brick_dict_info);
610                    if (ret) {
611                            gf_msg (args->this->name, GF_LOG_ERROR, 0,
612                                    DHT_MSG_BRICK_QUERY_FAILED,
(gdb) 
613                                    "Brick query failed\n");
614                            goto out;
615                    }
616            }

(gdb) p *args
$2 = {this = 0x0, defrag = 0x0, brick_list = 0x0, freq_time = 0, return_value =
0}

The args pointer itself is non-NULL, but all of its fields have been zeroed at
this point because of the same issue as in BZ 1263200. However, it looks like
the demote thread had already started processing the brick list before the args
were cleared.

On switching to the tier_start thread and traversing bricklist_hot:

 (gdb) bt
#0  0x00007ff8f8da099d in nanosleep () from /lib64/libpthread.so.0
#1  0x00007ff8f9f50914 in gf_timer_proc (ctx=0x7ff8fb371010) at timer.c:205
#2  0x00007ff8f8d99df5 in start_thread () from /lib64/libpthread.so.0
#3  0x00007ff8f86e01ad in clone () from /lib64/libc.so.6
(gdb) t 3
[Switching to thread 3 (Thread 0x7ff8ef549700 (LWP 32723))]
#0  0x00007ff8f86a748d in nanosleep () from /lib64/libc.so.6
(gdb) bt
#0  0x00007ff8f86a748d in nanosleep () from /lib64/libc.so.6
#1  0x00007ff8f86a7324 in sleep () from /lib64/libc.so.6
#2  0x00007ff8e791af2a in tier_start (this=0x7ff8e8020920,
defrag=0x7ff8e8028c10) at tier.c:860
#3  0x00007ff8e7dafd34 in gf_defrag_start_crawl (data=0x7ff8e8020920) at
dht-rebalance.c:2841
#4  0x00007ff8f9f72d72 in synctask_wrap (old_task=<optimized out>) at
syncop.c:380
#5  0x00007ff8f86310f0 in ?? () from /lib64/libc.so.6
#6  0x0000000000000000 in ?? ()

(gdb) f 2
#2  0x00007ff8e791af2a in tier_start (this=0x7ff8e8020920,
defrag=0x7ff8e8028c10) at tier.c:860
860                    sleep(1);

(gdb) p *((brick_list_t *)((char *)(bricklist_hot->next)-(unsigned
long)(&((brick_list_t *)0)->list)))
$3 = {xlator = 0x7ff8e8009c70, brick_db_path = 0x7ff8cc00c260
"/rhs/brick6/gold_hot/.glusterfs/gold_hot.db", list = {
    next = 0x7ff8cc000db0, prev = 0x7ff8db43bc70}}

(gdb) p *((brick_list_t *)((char *)(bricklist_hot->next->next)-(unsigned
long)(&((brick_list_t *)0)->list)))
$4 = {xlator = 0x7ff8e800be70, brick_db_path = 0x7ff8cc00d2b0
"/rhs/brick7/gold_hot/.glusterfs/gold_hot.db", list = {
    next = 0x7ff8db43bc70, prev = 0x7ff8cc000d40}}

(gdb) p *((brick_list_t *)((char *)(bricklist_hot->next->next->next)-(unsigned
long)(&((brick_list_t *)0)->list)))
$5 = {xlator = 0x7ff7a2de5700, brick_db_path = 0x7ff79fddf700 "", list = {next
= 0x7ff8cc000d40, prev = 0x7ff8cc000db0}} =====> Invalid xlator

(gdb) p *((brick_list_t *)((char
*)(bricklist_hot->next->next->next->next)-(unsigned long)(&((brick_list_t
*)0)->list)))
$6 = {xlator = 0x7ff8e8009c70, brick_db_path = 0x7ff8cc00c260
"/rhs/brick6/gold_hot/.glusterfs/gold_hot.db", list = {
    next = 0x7ff8cc000db0, prev = 0x7ff8db43bc70}}  ==> First entry

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are on the CC list for the bug.
You are the assignee for the bug.