[Gluster-devel] Re: afr :2 HA setup question

Matthias Albert gluster at linux4experts.de
Tue Sep 11 08:26:49 UTC 2007


Hi Amar,

Yes, I'm using afr and do a unify over afr.

Here are my configs.


---snip---
Server side: gluster3 example (the configs on gluster1, gluster2 and gluster4 are similar)

### file: server-volume.spec
# Namespace brick
volume brick
      type storage/posix                     # POSIX FS translator
      option directory /var/tmp         # Export this directory
end-volume

volume server
        type protocol/server
        option transport-type tcp/server 
        subvolumes brick
        option auth.ip.brick.allow *
end-volume

#
# local storage bricks
#

volume local-hdb1
        type storage/posix                  
        option directory /export/hdb1      
end-volume

volume local-sda1
        type storage/posix                  
        option directory /export/sda1        
end-volume

volume local-sdb1
        type storage/posix                  
        option directory /export/sdb1       
end-volume

volume local-lvm
        type storage/posix                  
        option directory /export/lvm      
end-volume

#
# performance translators
#

volume hdb1
        type performance/io-threads
        option thread-count 8
        subvolumes local-hdb1
end-volume

volume sda1
        type performance/io-threads
        option thread-count 8
        subvolumes local-sda1
end-volume

volume sdb1
        type performance/io-threads
        option thread-count 8
        subvolumes local-sdb1
end-volume

volume lvm
        type performance/io-threads
        option thread-count 8
        subvolumes local-lvm
end-volume

volume server-bricks
        type protocol/server
        option transport-type tcp/server     # for TCP/IP transport
        option listen-port 6997              # default is 6996
        subvolumes hdb1 sda1 sdb1 lvm
        option auth.ip.hdb1.allow *          # allow access to the "hdb1" volume
        option auth.ip.sda1.allow *          # allow access to the "sda1" volume
        option auth.ip.sdb1.allow *          # allow access to the "sdb1" volume
        option auth.ip.lvm.allow *           # allow access to the "lvm" volume
end-volume
---snap---

---snip---
Client side

#
# glusterfs Client Configuration
#

#
# Namespace Volume
#

volume brick
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster1     # IP address of the remote brick
  option remote-port 6996
  option remote-subvolume brick        # name of the remote volume
end-volume


# Remote Volumes from gluster1 - gluster4

#
# gluster1
#

volume gluster1-sdb1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster1     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume sdb1        # name of the remote volume
end-volume

volume gluster1-sdc1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster1     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume sdc1        # name of the remote volume
end-volume

volume gluster1-sdd1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster1     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume sdd1        # name of the remote volume
end-volume

volume gluster1-sde1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster1     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume sde1        # name of the remote volume
end-volume

volume gluster1-sdf1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster1     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume sdf1        # name of the remote volume
end-volume

#
# gluster2
#

volume gluster2-hdb1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster2     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume hdb1        # name of the remote volume
end-volume

volume gluster2-hdc1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster2     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume hdc1        # name of the remote volume
end-volume

#
# gluster3
#

volume gluster3-hdb1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster3     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume hdb1        # name of the remote volume
end-volume

volume gluster3-sda1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster3     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume sda1        # name of the remote volume
end-volume

volume gluster3-sdb1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster3     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume sdb1        # name of the remote volume
end-volume

volume gluster3-lvm
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster3     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume lvm        # name of the remote volume
end-volume


#
# gluster4
#

volume gluster4-hdc1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster4     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume hdc1        # name of the remote volume
end-volume

volume gluster4-hdb1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster4     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume hdb1        # name of the remote volume
end-volume

volume gluster4-sda1
  type protocol/client
  option transport-type tcp/client     # for TCP/IP transport
  option remote-host gluster4     # IP address of the remote brick
  option remote-port 6997
  option remote-subvolume sda1        # name of the remote volume
end-volume


#
# Replication translators
# AFR (Automatic File Replication)
#

volume afr1
  type cluster/afr
  subvolumes gluster3-hdb1 gluster4-hdc1
  option replicate *:2
end-volume

volume afr2
  type cluster/afr
  subvolumes gluster2-hdc1 gluster3-lvm
  option replicate *:2
end-volume

volume afr3
  type cluster/afr
  subvolumes gluster1-sde1 gluster1-sdf1
  option replicate *:2
end-volume

volume afr4
  type cluster/afr
  subvolumes gluster3-sda1 gluster3-sdb1
  option replicate *:2
end-volume

#
# Unify all gluster servers together to ONE share
#

volume cluster
  type cluster/unify
  subvolumes afr1 afr2 afr3 afr4 gluster2-hdb1 gluster4-hdb1 gluster4-sda1 gluster4-hdc1
  option scheduler alu                               # use the ALU scheduler
  option alu.limits.min-free-disk 6GB                # don't create files on a volume with less than 6GB of free disk space
  option alu.limits.max-open-files 10000             # don't create files on a volume with more than 10000 open files
  option namespace brick
  option alu.order disk-usage:read-usage:write-usage:open-files-usage:disk-speed-usage
  option alu.disk-usage.entry-threshold 100GB        # kick in if the discrepancy in disk usage between volumes reaches 100GB
  option alu.disk-usage.exit-threshold 60MB          # don't stop until at least 60MB has been written to the least-used volume
  option alu.open-files-usage.entry-threshold 1024   # kick in if the discrepancy in open files reaches 1024
  option alu.open-files-usage.exit-threshold 32      # don't stop until at least 32 files have been written to the least-used volume
  option alu.stat-refresh.interval 10sec             # refresh the statistics used for decision-making every 10 seconds
end-volume

#
# Performance Translator
# writebehind improves write performance a lot
#

volume writebehind
  type performance/write-behind
  option aggregate-size 131072 # unit in bytes
  option flush-behind off
  subvolumes cluster
end-volume

#
# Add readahead feature
#

volume readahead
  type performance/read-ahead
  option page-size 1MB      # read-ahead page size
  option page-count 2       # cache per file = (page-count x page-size)
  subvolumes writebehind
end-volume


volume io-perf
  type performance/io-cache
  option page-size 128KB
  option page-count 128
  subvolumes readahead
end-volume

---snap---

gluster4 was the one with the hardware problems ... oops :-). I see now that 
the subvolumes on my client side include not only the afr volumes but also 
single, unreplicated volumes from gluster4. That means that if gluster4 isn't 
responding (e.g. it has no network connection, or whatever) and a client 
writes to, say, gluster4-sda1 or gluster4-hdc1, as you can see in my example, 
it is absolutely normal that the client hangs, because those bricks have 
nothing to do with afr1, afr2 or afr3...
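
For example, a client-side layout where every unify subvolume is itself an 
afr pair would not leave any unreplicated brick under unify. This is only a 
sketch to illustrate the idea -- the extra pairs (afr5, afr6) and their 
pairings are hypothetical, not my real setup:

volume afr5
  type cluster/afr
  subvolumes gluster2-hdb1 gluster4-hdb1   # hypothetical pairing across two servers
  option replicate *:2
end-volume

volume afr6
  type cluster/afr
  subvolumes gluster4-sda1 gluster1-sdb1   # hypothetical pairing across two servers
  option replicate *:2
end-volume

volume cluster
  type cluster/unify
  subvolumes afr1 afr2 afr3 afr4 afr5 afr6   # only afr'd volumes, no single bricks
  option scheduler alu
  option namespace brick
end-volume

With something like that, losing gluster4 would only take away one half of 
each mirror instead of a whole unify subvolume.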

So in my case it is a configuration problem, or an error in my reasoning.

Or am I wrong?

Regards,

  Matthias





Amar S. Tumballi wrote:
> Hi Matthias,
>  Can I have a look at your spec file? Btw, are you using AFR? unify? 
> or unify over afr? Because if the node which went down held unify's 
> namespace and it was not afr'd, then there is a high chance this can 
> happen.
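
(For illustration: an afr'd namespace on the client side would look roughly 
like the sketch below. The second namespace export on gluster2 is only an 
assumption here, it is not part of the configs above:)

volume brick-ns2
  type protocol/client
  option transport-type tcp/client
  option remote-host gluster2          # assumed second namespace server
  option remote-port 6996
  option remote-subvolume brick        # assumed namespace export on gluster2
end-volume

volume ns-afr
  type cluster/afr
  subvolumes brick brick-ns2           # "brick" is the existing namespace client volume
  option replicate *:2
end-volume

# and in the unify volume: option namespace ns-afr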
>
> Anyways, posting your spec files may help us to solve your problem.
>
> Regards,
> Amar
>
> On 9/11/07, *Matthias Albert* <gluster at linux4experts.de> wrote:
>
>     Hi August,
>
>     I can confirm the problem with your setup. I've a 4-server glusterfsd
>     setup, also running 1.3.1, and some glusterfs clients with fuse glfs3.
>
>     One of these 4 servers had a hardware failure and was no longer
>     reachable -> the side effect was that none of my glusterfs clients
>     could write anything to the mounted glusterfs share. I've built a new
>     test machine and swapped the old one out for it. Probably this week
>     I'll have more time for playing and testing with glusterfs (also with
>     some performance translators).
>
>     I will test the "option transport-timeout X" option and see what
>     happens if I take one of the servers off the net.
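
(For illustration, transport-timeout is set per protocol/client volume; the 
value of 10 seconds below is only an example, not a recommendation:)

volume gluster4-sda1
  type protocol/client
  option transport-type tcp/client
  option remote-host gluster4
  option remote-port 6997
  option remote-subvolume sda1
  option transport-timeout 10          # example value, in seconds
end-volume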
>
>     Regards,
>
>        Matthias
>
>     August R. Wohlt wrote:
>     > Hi all -
>     >
>     > After combing through the archives, I found the transport-timeout
>     > option mentioned by avati. Is this described in the wiki docs
>     > anywhere? I thought I had read through every page, but don't recall
>     > seeing it. The e-mail from avati mentioned that it was described in
>     > "doc/translator-options.txt" but this file does not appear in my
>     > glusterfs-1.3.1 tarball.
>     >
>     > In any case, for those who have similar issues, making transport
>     > timeout much smaller is your friend :-)
>     >
>     > Many Thanks!!
>     > :august
>     >
>     > On 9/10/07, August R. Wohlt <glusterfs at isidore.net> wrote:
>     >
>     >> Hi devs et al,
>     >>
>     >> After many hours of sublimation, I was able to condense my
>     previous hanging
>     >> issue down to this simplest case.
>     >>
>     >> To summarize: I have two physical machines, each afr'ing a
>     directory to the
>     >> other. Both are glusterfs(d) 1.3.1 with glfs3 fuse. iptables is
>     suspended
>     >> during these tests. Spec files are below.
>     >>
>     >> The four situations:
>     >>
>     >> 1) If I start up both machines and start up glusterfsd on both
>     machines, I
>     >> can mount either one from the other and view its files as expected.
>     >>
>     >> 2) If I start up only one machine and glusterfsd, I can mount that
>     >> glusterfsd brick from the same machine and use it (ie edit the
>     files) while
>     >> it tries to connect to the 2nd machine in the background. When
>     I bring up
>     >> the 2nd machine, it connects and afrs as expected. Compare this
>     to #4).
>     >>
>     >> 3) If I start up both machines and glusterfsd on both, mount
>     each other's
>     >> bricks, verify I can see the files and then kill glusterfsd on
>     one of them,
>     >> I can still use and view files on the other one while it tries
>     to reconnect
>     >> in the background to the glusterfsd that was killed. When it
>     comes back up
>     >> everything continues as expected.
>     >>
>     >> 4) But, if I start up both machines with glusterfsd on both,
>     mount either
>     >> brick and view the files and then bring down the other machine
>     (ie not kill
>     >> glusterfsd, but bring down the whole machine suddenly, or pull
>     the ethernet
>     >> cable), I can no longer see any files on the remaining
>     machine. It just
>     >> hangs until the machine that is down comes back up and then it
>     continues on
>     >> its merry way.
>     >>
>     >> This is presumably not the expected behavior since it is not
>     the behavior in
>     >> 2) and 3). It is only after the machines have both started up
>     and then one
>     >> of them goes away that I see this problem. Obviously, however
>     this is the
>     >> very situation that calls for an HA setup in the real world.
>     When one server
>     >> goes offline suddenly, you want to be able to keep on using the
>     first.
>     >>
>     >> Here is the simplest spec file configuration that exhibits this
>     problem:
>     >>
>     >> Simple server configuration:
>     >>
>     >> volume brick-ds
>     >>     type storage/posix
>     >>     option directory /.brick-ds
>     >> end-volume
>     >>
>     >>  volume brick-ds-afr
>     >>     type storage/posix
>     >>     option directory /.brick-ds-afr
>     >> end-volume
>     >>
>     >> volume server
>     >>     type protocol/server
>     >>     option transport-type tcp/server
>     >>     option bind-address 192.168.16.128     # 192.168.16.1 on the other server
>     >>     subvolumes brick-ds brick-ds-afr
>     >>     option auth.ip.brick-ds.allow 192.168.16.*
>     >>     option auth.ip.brick-ds-afr.allow 192.168.16.*
>     >> end-volume
>     >>
>     >>
>     >> Client Configuration :
>     >>
>     >>    volume brick-ds-local
>     >>      type protocol/client
>     >>      option transport-type tcp/client
>     >>      option remote-host 192.168.16.128     # 192.168.16.1 on the other machine
>     >>      option remote-subvolume brick-ds
>     >>    end-volume
>     >>
>     >>    volume brick-ds-remote
>     >>       type protocol/client
>     >>       option transport-type tcp/client
>     >>       option remote-host 192.168.16.1     # 192.168.16.128 on the other machine
>     >>       option remote-subvolume brick-ds-afr
>     >>     end-volume
>     >>
>     >>      volume brick-ds-afr
>     >>       type cluster/afr
>     >>       subvolumes brick-ds-local brick-ds-remote
>     >>       option replicate *:2
>     >>     end-volume
>     >>
>     >> These are both stock CentOS/RHEL 5 machines. You can
>     demonstrate the
>     >> behavior by rebooting one machine, pulling out the ethernet
>     cable, or
>     >> sending the route out into space (ie route add -host 192.168.16.1
>     >> some_disconnected_device). Everything will be frozen until the
>     connection
>     >> returns and then when it comes back up, things keep working
>     again after
>     >> that.
>     >>
>     >> Because of this problem, any kind of HA / unify setup will not
>     work for me
>     >> when one of the nodes fails.
>     >>
>     >> Can someone else verify this behavior? If there is some part of
>     the logs /
>     strace / gdb output you'd like to see, just let me know. I'd
>     really like to
>     >> use glusterfs in an HA setup, but don't see how with this behavior.
>     >>
>     >> Thanks in advance!!
>     >> :august
>     >>
>     >>
>     >> On 9/7/07, August R. Wohlt <glusterfs at isidore.net> wrote:
>     >>
>     >>> Hi all -
>     >>>
>     >>> I have a setup based on this :
>     >>>
>     >>>
>     >>  http://www.gluster.org/docs/index.php/GlusterFS_High_Availability_Storage_with_GlusterFS
>     >>
>     >>> but with only 2 machines. Effectively just a mirror (glusterfsd
>     >>>
>     >> configuration below). 1.3.1 client and server.
>     >>
>     >>>
>     >>
>     >
>     >
>
>
> -- 
> Amar Tumballi
> Engineer - Gluster Core Team
> [bulde on #gluster/irc.gnu.org]
> http://www.zresearch.com - Commoditizing Supercomputing and Superstorage! 





