[Gluster-devel] Re: afr :2 HA setup question
Matthias Albert
gluster at linux4experts.de
Tue Sep 11 08:26:49 UTC 2007
Hi Amar,
yes, I'm using AFR and doing unify over AFR.
Here are my configs.
---snip---
Server side: gluster3 example (the configs of gluster1, gluster2 and gluster4 are similar)
### file: server-volume.spec
# Namespace brick
volume brick
type storage/posix # POSIX FS translator
option directory /var/tmp # Export this directory
end-volume
volume server
type protocol/server
option transport-type tcp/server
subvolumes brick
option auth.ip.brick.allow *
end-volume
#
# local storage bricks
#
volume local-hdb1
type storage/posix
option directory /export/hdb1
end-volume
volume local-sda1
type storage/posix
option directory /export/sda1
end-volume
volume local-sdb1
type storage/posix
option directory /export/sdb1
end-volume
volume local-lvm
type storage/posix
option directory /export/lvm
end-volume
#
# performance translators
#
volume hdb1
type performance/io-threads
option thread-count 8
subvolumes local-hdb1
end-volume
volume sda1
type performance/io-threads
option thread-count 8
subvolumes local-sda1
end-volume
volume sdb1
type performance/io-threads
option thread-count 8
subvolumes local-sdb1
end-volume
volume lvm
type performance/io-threads
option thread-count 8
subvolumes local-lvm
end-volume
volume server
type protocol/server
option transport-type tcp/server # For TCP/IP transport
option listen-port 6997 # Default is 6996
subvolumes hdb1 sda1 sdb1
option auth.ip.hdb1.allow * # Allow access to "hdb1" volume
option auth.ip.sda1.allow * # Allow access to "sda1" volume
option auth.ip.sdb1.allow * # Allow access to "sdb1" volume
option auth.ip.lvm.allow * # Allow access to "lvm" volume
end-volume
---snap---
---snip---
Client side
#
# glusterfs Client Configuration
#
#
# Namespace Volume
#
volume brick
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster1 # IP address of the remote brick
option remote-port 6996
option remote-subvolume brick # name of the remote volume
end-volume
# Remote Volumes from gluster1 - gluster4
#
# gluster1
#
volume gluster1-sdb1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster1 # IP address of the remote brick
option remote-port 6997
option remote-subvolume sdb1 # name of the remote volume
end-volume
volume gluster1-sdc1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster1 # IP address of the remote brick
option remote-port 6997
option remote-subvolume sdc1 # name of the remote volume
end-volume
volume gluster1-sdd1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster1 # IP address of the remote brick
option remote-port 6997
option remote-subvolume sdd1 # name of the remote volume
end-volume
volume gluster1-sde1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster1 # IP address of the remote brick
option remote-port 6997
option remote-subvolume sde1 # name of the remote volume
end-volume
volume gluster1-sdf1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster1 # IP address of the remote brick
option remote-port 6997
option remote-subvolume sdf1 # name of the remote volume
end-volume
#
# gluster2
#
volume gluster2-hdb1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster2 # IP address of the remote brick
option remote-port 6997
option remote-subvolume hdb1 # name of the remote volume
end-volume
volume gluster2-hdc1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster2 # IP address of the remote brick
option remote-port 6997
option remote-subvolume hdc1 # name of the remote volume
end-volume
#
# gluster3
#
volume gluster3-hdb1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster3 # IP address of the remote brick
option remote-port 6997
option remote-subvolume hdb1 # name of the remote volume
end-volume
volume gluster3-sda1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster3 # IP address of the remote brick
option remote-port 6997
option remote-subvolume sda1 # name of the remote volume
end-volume
volume gluster3-sdb1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster3 # IP address of the remote brick
option remote-port 6997
option remote-subvolume sdb1 # name of the remote volume
end-volume
volume gluster3-lvm
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster3 # IP address of the remote brick
option remote-port 6997
option remote-subvolume lvm # name of the remote volume
end-volume
#
# gluster4
#
volume gluster4-hdc1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster4 # IP address of the remote brick
option remote-port 6997
option remote-subvolume hdc1 # name of the remote volume
end-volume
volume gluster4-hdb1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster4 # IP address of the remote brick
option remote-port 6997
option remote-subvolume hdb1 # name of the remote volume
end-volume
volume gluster4-sda1
type protocol/client
option transport-type tcp/client # for TCP/IP transport
option remote-host gluster4 # IP address of the remote brick
option remote-port 6997
option remote-subvolume sda1 # name of the remote volume
end-volume
#
# Replication Translators
# AFR (Automatic File Replication)
#
volume afr1
type cluster/afr
subvolumes gluster3-hdb1 gluster4-hdc1
option replicate *:2
end-volume
volume afr2
type cluster/afr
subvolumes gluster2-hdc1 gluster3-lvm
option replicate *:2
end-volume
volume afr3
type cluster/afr
subvolumes gluster1-sde1 gluster1-sdf1
option replicate *:2
end-volume
volume afr4
type cluster/afr
subvolumes gluster3-sda1 gluster3-sdb1
option replicate *:2
end-volume
#
# Unify all gluster servers together to ONE share
#
volume cluster
type cluster/unify
subvolumes afr1 afr2 afr3 afr4 gluster2-hdb1 gluster4-hdb1 gluster4-sda1 gluster4-hdc1
option scheduler alu # use the ALU scheduler
option alu.limits.min-free-disk 6GB # Don't create files on a volume with less than 6GB free disk space
option alu.limits.max-open-files 10000 # Don't create files on a volume with more than 10000 files open
option namespace brick
option alu.order disk-usage:read-usage:write-usage:open-files-usage:disk-speed-usage
option alu.disk-usage.entry-threshold 100GB # Kick in if the discrepancy in disk usage between volumes exceeds 100GB
option alu.disk-usage.exit-threshold 60MB # Don't stop until you've written at least 60MB to the least-used volume
option alu.open-files-usage.entry-threshold 1024 # Kick in if the discrepancy in open files is 1024
option alu.open-files-usage.exit-threshold 32 # Don't stop until you've written at least 32 files to the least-used volume
option alu.stat-refresh.interval 10sec # Refresh the statistics used for decision-making every 10 seconds
end-volume
#
# Performance Translator
# writebehind improves write performance a lot
#
volume writebehind
type performance/write-behind
option aggregate-size 131072 # unit in bytes
option flush-behind off
subvolumes cluster
end-volume
#
# Add readahead feature
#
volume readahead
type performance/read-ahead
option page-size 1MB # unit in bytes
option page-count 2 # cache per file = (page-count x page-size)
subvolumes writebehind
end-volume
volume io-perf
type performance/io-cache
option page-size 128KB
option page-count 128
subvolumes readahead
end-volume
---snap---
gluster4 was the one that had the hardware problems ... oops ... oh :-).
I see that the subvolumes on the client side include not only the afr
volumes but also single, unreplicated volumes from gluster4. That means
if gluster4 isn't responding (e.g. it has no network connection, or
whatever) and a client is writing to gluster4-sda1 or gluster4-hdc1, as
you can see in my example -> it is completely normal that my client
hangs, because those bricks have nothing to do with afr1, afr2 or afr3...
So in my case it is a configuration problem or an error in reasoning.
Or am I wrong?
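If that is the case, something like this should avoid it (just an
untested sketch; the pairing of the bricks is only an example, any
partner brick on a different server would do): wrap the remaining single
bricks in afr as well, so that unify only ever sees replicated subvolumes.
---snip---
# sketch only: pair the remaining single bricks across two servers
volume afr5
type cluster/afr
subvolumes gluster2-hdb1 gluster4-hdb1 # example pairing, two different hosts
option replicate *:2
end-volume
# gluster4-sda1 (and gluster4-hdc1, which is already part of afr1) would
# also need partner bricks on other servers, e.g. a hypothetical
# gluster1-sdg1 exported the same way
volume cluster
type cluster/unify
subvolumes afr1 afr2 afr3 afr4 afr5 # only afr'd volumes, no single bricks
option scheduler alu
option namespace brick
# ... keep the remaining alu options from above
end-volume
---snap---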
Regards,
Matthias
Amar S. Tumballi wrote:
> Hi Matthias,
> Can I have a look at your spec file? Btw, are you using AFR? unify?
> or unify over afr? Because if the node which went down had unify's
> namespace and it was not afr'd, then there is a high chance this can
> happen.
>
> Anyways, posting your spec files may help us to solve your problem.
>
> Regards,
> Amar
>
> On 9/11/07, Matthias Albert <gluster at linux4experts.de> wrote:
>
> Hi August,
>
> I can confirm the problem you are seeing with your setup. I've a
> 4-server glusterfsd setup, also running 1.3.1, and some glusterfs
> clients with fuse glfs3.
>
> One of these 4 servers had a hardware failure and was no longer
> reachable -> the side effect was that none of my glusterfs clients
> could write anything to the mounted glusterfs share. I've built a new
> test machine and replaced the old one with it. Probably this week I
> will have more time for playing and testing with glusterfs (also with
> some performance translators).
>
> I will test the "option transport-timeout X" and see what happens if
> I take one of them off the net.
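>
> Something like this is what I have in mind (just a sketch, not tested
> yet; "remote-brick" is a placeholder name and 10 seconds is an
> arbitrary value). As far as I understand it, the option belongs in
> each protocol/client volume on the client side:
>
> volume remote-brick
> type protocol/client
> option transport-type tcp/client
> option remote-host gluster4
> option remote-port 6997
> option remote-subvolume sda1
> option transport-timeout 10 # seconds to wait before giving up on a dead server
> end-volume
>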
>
> Regards,
>
> Matthias
>
> August R. Wohlt wrote:
> > Hi all -
> >
> > After combing through the archives, I found the transport-timeout
> > option mentioned by avati. Is this described in the wiki docs
> > anywhere? I thought I had read through every page, but don't recall
> > seeing it. The e-mail from avati mentioned that it was described in
> > "doc/translator-options.txt" but this file does not appear in my
> > glusterfs-1.3.1 tarball.
> >
> > In any case, for those who have similar issues, making transport
> > timeout much smaller is your friend :-)
> >
> > Many Thanks!!
> > :august
> >
> > On 9/10/07, August R. Wohlt <glusterfs at isidore.net> wrote:
> >
> >> Hi devs et al,
> >>
> >> After many hours of sublimation, I was able to condense my
> previous hanging
> >> issue down to this simplest case.
> >>
> >> To summarize: I have two physical machines, each afr'ing a
> directory to the
> >> other. both are glusterfs(d) 1.3.1 with glfs3 fuse. iptables is
> suspended
> >> during these tests. Spec files are below.
> >>
> >> The four situations:
> >>
> >> 1) If I start up both machines and start up glusterfsd on both
> machines, I
> >> can mount either one from the other and view its files as expected.
> >>
> >> 2) If I start up only one machine and glusterfsd, I can mount that
> >> glusterfsd brick from the same machine and use it (ie edit the
> files) while
> >> it tries to connect to the 2nd machine in the background. When
> I bring up
> >> the 2nd machine, it connects and afrs as expected. Compare this
> to #4).
> >>
> >> 3) If I start up both machines and glusterfsd on both, mount
> each others'
> >> bricks, verify I can see the files and then kill glusterfsd on
> one of them,
> >> I can still use and view files on the other one while it tries
> to reconnect
> >> in the background to the glusterfsd that was killed. When it
> comes back up
> >> everything continues as expected.
> >>
> >> 4) But, if I start up both machines with glusterfsd on both,
> mount either
> >> brick and view the files and then bring down the other machine
> (ie not kill
> >> glusterfsd, but bring down the whole machine suddenly, or pull
> the ethernet
> >> cable) , I can no longer see any files on the remaining
> machine. It just
> >> hangs until the machine that is down comes back up and then it
> continues on
> >> its merry way.
> >>
> >> This is presumably not the expected behavior since it is not
> the behavior in
> >> 2) and 3). It is only after the machines have both started up
> and then one
> >> of them goes away that I see this problem. Obviously, however
> this is the
> >> very situation that calls for an HA setup in the real world.
> When one server
> >> goes offline suddenly, you want to be able to keep on using the
> first.
> >>
> >> Here is the simplest spec file configuration that exhibits this
> problem:
> >>
> >> Simple server configuration:
> >>
> >> volume brick-ds
> >> type storage/posix
> >> option directory /.brick-ds
> >> end-volume
> >>
> >> volume brick-ds-afr
> >> type storage/posix
> >> option directory /.brick-ds-afr
> >> end-volume
> >>
> >> volume server
> >> type protocol/server
> >> option transport-type tcp/server
> >> option bind-address 192.168.16.128 # 192.168.16.1 on the other server
> >> subvolumes brick-ds brick-ds-afr
> >> option auth.ip.brick-ds.allow 192.168.16.*
> >> option auth.ip.brick-ds-afr.allow 192.168.16.*
> >> end-volume
> >>
> >>
> >> Client Configuration :
> >>
> >> volume brick-ds-local
> >> type protocol/client
> >> option transport-type tcp/client
> >> option remote-host 192.168.16.128 # 192.168.16.1 on the other machine
> >> option remote-subvolume brick-ds
> >> end-volume
> >>
> >> volume brick-ds-remote
> >> type protocol/client
> >> option transport-type tcp/client
> >> option remote-host 192.168.16.1 # 192.168.16.128 on the other machine
> >> option remote-subvolume brick-ds-afr
> >> end-volume
> >>
> >> volume brick-ds-afr
> >> type cluster/afr
> >> subvolumes brick-ds-local brick-ds-remote
> >> option replicate *:2
> >> end-volume
> >>
> >> These are both stock CentOS/RHEL 5 machines. You can
> demonstrate the
> >> behavior by rebooting one machine, pulling out the ethernet
> cable, or
> >> sending the route out into space (ie route add -host 192.168.16.1
> >> some_disconnected_device). Everything will be frozen until the
> connection
> >> returns and then when it comes back up, things keep working
> again after
> >> that.
> >>
> >> Because of this problem, any kind of HA / unify setup will not
> work for me
> >> when one of the nodes fails.
> >>
> >> Can someone else verify this behavior? If there is some part of
> the logs /
> >> strace / gdb output you'd like to see , just let me know. I'd
> really like to
> >> use glusterfs in an HA setup, but don't see how with this behavior.
> >>
> >> Thanks in advance!!
> >> :august
> >>
> >>
> >> On 9/7/07, August R. Wohlt <glusterfs at isidore.net> wrote:
> >>
> >>> Hi all -
> >>>
> >>> I have a setup based on this :
> >>>
> >>>
> >> http://www.gluster.org/docs/index.php/GlusterFS_High_Availability_Storage_with_GlusterFS
> >>
> >>> but with only 2 machines. Effectively just a mirror (glusterfsd
> >>>
> >> configuration below). 1.3.1 client and server.
> >>
> >>>
> >>
> >
> >
> > _______________________________________________
> > Gluster-devel mailing list
> > Gluster-devel at nongnu.org
> > http://lists.nongnu.org/mailman/listinfo/gluster-devel
> >
>
>
>
> _______________________________________________
> Gluster-devel mailing list
> Gluster-devel at nongnu.org
> http://lists.nongnu.org/mailman/listinfo/gluster-devel
>
>
>
>
> --
> Amar Tumballi
> Engineer - Gluster Core Team
> [bulde on #gluster/irc.gnu.org]
> http://www.zresearch.com - Commoditizing Supercomputing and Superstorage!