Option Descriptions

$ gluster volume set help
Option Default Description
changelog.changelog-barrier-timeout 120 After ‘timeout’ seconds since the time ‘barrier’ option was set to “on”, unlink/rmdir/rename operations are no longer blocked and previously blocked fops are allowed to go through
cluster.enable-shared-storage disable Create and mount the shared storage volume(gluster_shared_storage) at /var/run/gluster/shared_storage on enabling this option. Unmount and delete the shared storage volume on disabling this option.
cluster.write-freq-threshold 0 Defines the number of writes, in a promotion/demotion cycle, that would mark a file HOT for promotion. Any file that has write hits less than this value will be considered as COLD and will be demoted.
cluster.read-freq-threshold 0 Defines the number of reads, in a promotion/demotion cycle, that would mark a file HOT for promotion. Any file that has read hits less than this value will be considered as COLD and will be demoted.
cluster.tier-pause off (null)
cluster.tier-promote-frequency 120 Frequency to promote files to fast tier
cluster.tier-demote-frequency 120 Frequency to demote files to slow tier
cluster.watermark-hi 90 Upper % watermark for promotion. If hot tier fills above this percentage, no promotion will happen and demotion will happen with high probability.
cluster.watermark-low 75 Lower % watermark. If hot tier is less full than this, promotion will happen and demotion will not happen. If greater than this, promotion/demotion will happen at a probability relative to how full the hot tier is.
cluster.tier-mode cache Either ‘test’ or ‘cache’. Test mode periodically demotes or promotes files automatically based on access. Cache mode does so based on whether the cache is full or not, as specified with watermarks.
cluster.tier-max-mb 10000 The maximum number of MB that may be migrated in any direction in a given cycle by a single node.
cluster.tier-max-files 50000 The maximum number of files that may be migrated in any direction in a given cycle by a single node.
cluster.lookup-unhashed on This option if set to ON, does a lookup through all the sub-volumes, in case a lookup didn’t return any result from the hash subvolume. If set to OFF, it does not do a lookup on the remaining subvolumes.
cluster.lookup-optimize off This option if set to ON enables the optimization of -ve lookups, by not doing a lookup on non-hashed subvolumes for files, in case the hashed subvolume does not return any result. This option disregards the lookup-unhashed setting, when enabled.
cluster.min-free-disk 10% Percentage/Size of disk space, after which the process starts balancing out the cluster, and logs will appear in log files
cluster.min-free-inodes 5% After the system has only N% of inodes free, warnings start to appear in log files
cluster.rebalance-stats off This option if set to ON displays and logs the time taken for migration of each file, during the rebalance process. If set to OFF, the rebalance logs will only display the time spent in each directory.
cluster.subvols-per-directory null Specifies the directory layout spread. Takes number of subvolumes as default value.
cluster.readdir-optimize off This option if set to ON enables the optimization that allows DHT to requests non-first subvolumes to filter out directory entries.
cluster.rebal-throttle normal Sets the maximum number of parallel file migrations allowed on a node during the rebalance operation. The default value is normal and allows a max of [($(processing units) - 4) / 2), 2] files to be migrated at a time. Lazy will allow only one file to be migrated at a time and aggressive will allow max of [($(processing units) - 4) / 2), 4]
cluster.weighted-rebalance on When enabled, files will be allocated to bricks with a probability proportional to their size. Otherwise, all bricks will have the same probability (legacy behavior).
cluster.entry-change-log on Entry fops like create/unlink will not perform pre/post fop changelog operations in afr transaction if this option is disabled
cluster.read-subvolume null inode-read fops happen only on one of the bricks in replicate. Afr will prefer the one specified using this option if it is not stale. Option value must be one of the xlator names of the children. Ex: -client-0 till -client-<number-of-bricks - 1>
cluster.read-subvolume-index -1 inode-read fops happen only on one of the bricks in replicate. AFR will prefer the one specified using this option if it is not stale. allowed options include -1 till replica-count - 1
cluster.read-hash-mode 1 inode-read fops happen only on one of the bricks in replicate. AFR will prefer the one computed using the method specified using this option. 0 = first up server, 1 = hash by GFID of file (all clients use same subvolume), 2 = hash by GFID of file and client PID
cluster.background-self-heal-count 16 This specifies the number of self-heals that can be performed in background without blocking the fop
cluster.metadata-self-heal on Using this option we can enable/disable metadata i.e. Permissions, ownerships, xattrs self-heal on the file/directory.
cluster.data-self-heal on Using this option we can enable/disable data self-heal on the file. “open” means data self-heal action will only be triggered by file open operations.
cluster.entry-self-heal on Using this option we can enable/disable entry self-heal on the directory.
cluster.self-heal-daemon on This option applies to only self-heal-daemon. Index directory crawl and automatic healing of files will not be performed if this option is turned off.
cluster.heal-timeout 600 time interval for checking the need to self-heal in self-heal-daemon
cluster.self-heal-window-size 1 Maximum number blocks per file for which self-heal process would be applied simultaneously.
cluster.data-change-log on Data fops like write/truncate will not perform pre/post fop changelog operations in afr transaction if this option is disabled
cluster.metadata-change-log on Metadata fops like setattr/setxattr will not perform pre/post fop changelog operations in afr transaction if this option is disabled
cluster.data-self-heal-algorithm null Select between “full”, “diff”. The “full” algorithm copies the entire file from source to sink. The “diff” algorithm copies to sink only those blocks whose checksums don’t match with those of source. If no option is configured the option is chosen dynamically as follows: If the file does not exist on one of the sinks or empty file exists or if the source file size is about the same as page size the entire file will be read and written i.e “full” algo, otherwise “diff” algo is chosen.
cluster.eager-lock on Lock phase of a transaction has two sub-phases. First is an attempt to acquire locks in parallel by broadcasting non-blocking lock requests. If lock acquisition fails on any server, then the held locks are unlocked and revert to a blocking locked mode sequentially on one server after another. If this option is enabled the initial broadcasting lock request attempt to acquire lock on the entire file. If this fails, we revert back to the sequential “regional” blocking lock as before. In the case where such an “eager” lock is granted in the non-blocking phase, it gives rise to an opportunity for optimization. i.e, if the next write transaction on the same FD arrives before the unlock phase of the first transaction, it “takes over” the full file lock. Similarly if yet another data transaction arrives before the unlock phase of the “optimized” transaction, that in turn “takes over” the lock as well. The actual unlock now happens at the end of the last “optimized” transaction.
cluster.quorum-type none If value is “fixed” only allow writes if quorum-count bricks are present. If value is “auto” only allow writes if more than half of bricks, or exactly half including the first, are present.
cluster.quorum-count null If quorum-type is "fixed" only allow writes if this many bricks are present. Other quorum types will OVERWRITE this value.
cluster.choose-local true Choose a local subvolume (i.e. Brick) to read from if read-subvolume is not explicitly set.
cluster.self-heal-readdir-size 1KB readdirp size for performing entry self-heal
cluster.ensure-durability on Afr performs fsyncs for transactions if this option is on to make sure the changelogs/data is written to the disk
cluster.consistent-metadata no If this option is enabled, readdirp will force lookups on those entries read whose read child is not the same as that of the parent. This will guarantee that all read operations on a file serve attributes from the same subvol as long as it holds a good copy of the file/dir.
cluster.stripe-block-size 128KB Size of the stripe unit that would be read from or written to the striped servers.
cluster.stripe-coalesce true Enable/Disable coalesce mode to flatten striped files as stored on the server (i.e., eliminate holes caused by the traditional format).
cluster.server-quorum-type (null) This feature is on the server-side i.e. in glusterd. Whenever the glusterd on a machine observes that the quorum is not met, it brings down the bricks to prevent data split-brains. When the network connections are brought back up and the quorum is restored the bricks in the volume are brought back up.
cluster.server-quorum-ratio (null) Sets the quorum percentage for the trusted storage pool.
cluster.quorum-reads no If quorum-reads is “true” only allow reads if quorum is met when quorum is enabled.
diagnostics.latency-measurement off If on stats related to the latency of each operation would be tracked inside GlusterFS data-structures.
diagnostics.dump-fd-stats off If on stats related to file-operations would be tracked inside GlusterFS data-structures.
diagnostics.brick-log-level INFO Changes the log-level of the bricks
diagnostics.client-log-level INFO Changes the log-level of the clients
diagnostics.brick-sys-log-level CRITICAL Gluster’s syslog log-level
diagnostics.client-sys-log-level CRITICAL Gluster’s syslog log-level
diagnostics.brick-logger null null
diagnostics.client-logger null null
diagnostics.brick-log-format null null
diagnostics.client-log-format null null
diagnostics.brick-log-buf-size 5 null
diagnostics.client-log-buf-size 5 null
diagnostics.brick-log-flush-timeout 20 null
diagnostics.client-log-flush-timeout 20 null
disperse.background-heals 8 This option can be used to control number of parallel heals
disperse.heal-wait-qlength 128 This option can be used to control number of heals that can wait
disperse.read-policy round-robin inode-read fops happen only on ‘k’ number of bricks in n=k+m disperse subvolume. ‘round-robin’ selects the read subvolume using round-robin algo. ‘gfid-hash’ selects read subvolume based on hash of the gfid of that file/directory.
dht.force-readdirp on This option if set to ON, forces the use of readdirp, and hence also displays the stats of the files.
performance.cache-max-file-size 0 Maximum file size which would be cached by the io-cache translator.
performance.cache-min-file-size 0 Minimum file size which would be cached by the io-cache translator.
performance.cache-refresh-timeout 1 The cached data for a file will be retained till ‘cache-refresh-timeout’ seconds, after which data re-validation is performed.
performance.cache-priority Assigns priority to filenames with specific patterns so that when a page needs to be ejected out of the cache, the page of a file whose priority is the lowest will be ejected earlier
performance.cache-size 32MB Size of the read cache.
performance.io-thread-count 16 Number of threads in IO threads translator which perform concurrent IO operations
performance.high-prio-threads 16 Max number of threads in IO threads translator which perform high priority IO operations at a given time
performance.normal-prio-threads 16 Max number of threads in IO threads translator which perform normal priority IO operations at a given time
performance.low-prio-threads 16 Max number of threads in IO threads translator which perform low priority IO operations at a given time
performance.least-prio-threads 1 Max number of threads in IO threads translator which perform least priority IO operations at a given time
performance.enable-least-priority on Enable/Disable least priority
performance.least-rate-limit 0 Max number of least priority operations to handle per-second
performance.flush-behind on If this option is set ON, instructs write-behind translator to perform flush in background, by returning success (or any errors, if any of previous writes were failed) to application even before flush FOP is sent to backend filesystem.
performance.nfs.flush-behind on If this option is set ON, instructs write-behind translator to perform flush in background, by returning success (or any errors, if any of previous writes were failed) to application even before flush FOP is sent to backend filesystem.
performance.write-behind-window-size 1MB Size of the write-behind buffer for a single file (inode).
performance.nfs.write-behind-window-size 1MB Size of the write-behind buffer for a single file (inode).
performance.strict-o-direct off This option when set to off, ignores the O_DIRECT flag.
performance.nfs.strict-o-direct off This option when set to off, ignores the O_DIRECT flag.
performance.strict-write-ordering off Do not let later writes overtake earlier writes even if they do not overlap
performance.nfs.strict-write-ordering off Do not let later writes overtake earlier writes even if they do not overlap
performance.lazy-open yes Perform open in the backend only when a necessary FOP arrives (e.g writev on the FD, unlink of the file). When option is disabled, perform backend open right after unwinding open().
performance.read-after-open no read is sent only after actual open happens and real fd is obtained, instead of doing on anonymous fd (similar to write)
performance.read-ahead-page-count 4 Number of pages that will be pre-fetched
performance.md-cache-timeout 1 Time period after which cache has to be refreshed
performance.write-behind on enable/disable write-behind translator in the volume.
performance.read-ahead on enable/disable read-ahead translator in the volume.
performance.readdir-ahead off enable/disable readdir-ahead translator in the volume.
performance.io-cache on enable/disable io-cache translator in the volume.
performance.quick-read on enable/disable quick-read translator in the volume.
performance.open-behind on enable/disable open-behind translator in the volume.
performance.stat-prefetch on enable/disable meta-data caching translator in the volume.
performance.client-io-threads off enable/disable io-threads translator in the client graph of volume.
performance.nfs.write-behind on enable/disable write-behind translator in the volume
performance.force-readdirp true Convert all readdir requests to readdirplus to collect stat info on each entry.
features.encryption off enable/disable client-side encryption for the volume.
encryption.master-key null Pathname of regular file which contains master volume key
encryption.data-key-size 256 Data key size (bits)
encryption.block-size 4096 Atom size (bits)
nfs.enable-ino32 no For nfs clients or apps that do not support 64-bit inode numbers, use this option to make NFS return 32-bit inode numbers instead. Disabled by default, so NFS returns 64-bit inode numbers.
nfs.mem-factor 15 Use this option to make NFS be faster on systems by using more memory. This option specifies a multiple that determines the total amount of memory used. Default value is 15. Increase to use more memory in order to improve performance for certain use cases. Please consult the gluster-users list before using this option.
nfs.export-dirs on By default, all subvolumes of nfs are exported as individual exports. There are cases where a subdirectory or subdirectories in the volume need to be exported separately. Enabling this option allows any directory on a volume to be exported separately. Directory exports are enabled by default.
nfs.export-volumes on Enable or disable exporting whole volumes, instead if used in conjunction with nfs3.export-dir, can allow setting up only subdirectories as exports. On by default.
nfs.addr-namelookup off Users have the option of turning on name lookup for incoming client connections using this option. Use this option to turn on name lookups during address-based authentication. Turning this on will enable you to use hostnames in nfs.rpc-auth-* filters. In some setups, the name server can take too long to reply to DNS queries resulting in timeouts of mount requests. By default, name lookup is off
nfs.dynamic-volumes off Internal option set to tell gnfs to use a different scheme for encoding file handles when DVM is being used.
nfs.register-with-portmap on For systems that need to run multiple nfs servers, only one registration is possible with portmap service. Use this option to turn off portmap registration for Gluster NFS. On by default
nfs.outstanding-rpc-limit 16 Parameter to throttle the number of incoming RPC requests from a client. 0 means no limit (can potentially run out of memory)
nfs.port 2049 Use this option on systems that need Gluster NFS to be associated with a non-default port number.
nfs.rpc-auth-unix on Disable or enable the AUTH_UNIX authentication type for a particular exported volume overriding defaults and general setting for AUTH_UNIX scheme. Must always be enabled for better interoperability. However, can be disabled if needed. Enabled by default.
nfs.rpc-auth-null on Disable or enable the AUTH_NULL authentication type for a particular exported volume overriding defaults and general setting for AUTH_NULL. Must always be enabled. This option is here only to avoid unrecognized option warnings.
nfs.rpc-auth-allow all Allow a comma separated list of addresses and/or hostnames to connect to the server. By default, all connections are allowed. This allows users to define a rule for a specific exported volume.
nfs.rpc-auth-reject none Reject a comma separated list of addresses and/or hostnames from connecting to the server. By default, all connections are allowed. This allows users to define a rule for a specific exported volume.
nfs.ports-insecure off Allow client connections from unprivileged ports. By default only privileged ports are allowed. Use this option to enable or disable insecure ports for a specific subvolume and to override the global setting set by the previous option.
nfs.transport-type (null) Specifies the nfs transport type. Valid transport types are ‘tcp’ and ‘rdma’.
nfs.trusted-sync off All writes and COMMIT requests are treated as async. This implies that no write requests are guaranteed to be on server disks when the write reply is received at the NFS client. Trusted sync includes trusted-write behaviour. Off by default.
nfs.trusted-write off On an UNSTABLE write from client, return STABLE flag to force client to not send a COMMIT request. In some environments, combined with a replicated GlusterFS setup, this option can improve write performance. This flag allows user to trust Gluster replication logic to sync data to the disks and recover when required. COMMIT requests if received will be handled in a default manner by fsyncing. STABLE writes are still handled in a sync manner. Off by default.
nfs.volume-access read-write Type of access desired for this subvolume: read-only, read-write(default)
nfs.export-dir By default, all subvolumes of nfs are exported as individual exports. There are cases where a subdirectory or subdirectories in the volume need to be exported separately. This option can also be used in conjunction with the nfs3.export-volumes option to restrict exports only to the subdirectories specified through this option. Must be an absolute path. Along with the path, an allowed list of IPs/hostnames can be associated with each subdirectory. If provided, connections will be allowed only from these IPs. By default connections from all IPs are allowed. Format: [(hostspec[|hostspec|…])][,…], where hostspec can be an IP address, hostname or an IP range in CIDR notation, e.g. /foo(192.168.1.0/24|host1|10.1.1.8),/host2. NOTE: Care must be taken while configuring this option as invalid entries and/or unreachable DNS servers can introduce unwanted delay in all the mount calls.
nfs.disable false This option is used to start or stop the NFS server for individual volumes.
nfs.nlm on This option, if set to ‘off’, disables NLM server by not registering the service with the portmapper. Set it to ‘on’ to re-enable it. Default value: ‘on’
nfs.acl on This option is used to control ACL support for NFS.
nfs.mount-udp off set the option to ‘on’ to enable mountd on UDP. Required for some Solaris and AIX NFS clients. The need for enabling this option often depends on the usage of NLM.
nfs.mount-rmtab /var/lib/glusterd/nfs/rmtab Set the location of the cache file that is used to list all the NFS-clients that have connected through the MOUNT protocol. If this is on shared storage, all GlusterFS servers will update and output (with ‘showmount’) the same list. Set to “/-“ to disable.
nfs.drc off Enable Duplicate Request Cache in gNFS server to improve correctness of non-idempotent operations like write, delete, link, et al
nfs.drc-size 0x20000 Sets the number of non-idempotent requests to cache in drc
nfs.read-size (1 * 1048576ULL) Size in which the client should issue read requests to the Gluster NFSv3 server. Must be a multiple of 4KB (4096). Min and Max supported values are 4KB (4096) and 1MB (1048576) respectively. If the specified value is within the supported range but not a multiple of 4096, it is rounded up to the nearest multiple of 4096.
nfs.write-size (1 * 1048576ULL) Size in which the client should issue write requests to the Gluster NFSv3 server. Must be a multiple of 1KB (1024). Min and Max supported values are 4KB (4096) and 1MB(1048576) respectively. If the specified value is within the supported range but not a multiple of 4096, it is rounded up to the nearest multiple of 4096.
nfs.readdir-size (1 * 1048576ULL) Size in which the client should issue directory reading requests to the Gluster NFSv3 server. Must be a multiple of 1KB (1024). Min and Max supported values are 4KB (4096) and 1MB (1048576) respectively.If the specified value is within the supported range but not a multiple of 4096, it is rounded up to the nearest multiple of 4096.
nfs.exports-auth-enable (null) Set the option to ‘on’ to enable exports/netgroup authentication in the NFS server and mount daemon.
nfs.auth-refresh-interval-sec (null) Frequency in seconds that the daemon should check for changes in the exports/netgroups file.
nfs.auth-cache-ttl-sec (null) Sets the TTL of an entry in the auth cache. Value is in seconds.
ganesha.enable off export volume via NFS-Ganesha
network.frame-timeout 1800 Time frame after which the (file) operation would be declared as dead, if the server does not respond for a particular (file) operation.
network.ping-timeout 42 Time duration for which the client waits to check if the server is responsive.
network.tcp-window-size null Specifies the window size for tcp socket.
network.remote-dio disable If enabled, in open() and creat() calls, O_DIRECT flag will be filtered at the client protocol level so server will still continue to cache the file. This works similar to NFS’s behavior of O_DIRECT
network.inode-lru-limit 16384 Specifies the maximum megabytes of memory to be used in the inode cache.
network.compression off enable/disable network compression translator
network.compression.window-size -15 Size of the zlib history buffer.
network.compression.mem-level 8 Memory allocated for internal compression state. 1 uses minimum memory but is slow and reduces compression ratio; memLevel=9 uses maximum memory for optimal speed. The default value is 8.
network.compression.min-size 0 Data is compressed only when its size exceeds this.
network.compression.compression-level -1 Compression levels 0 : no compression, 1 : best speed, 9 : best compression, -1 : default compression
features.lock-heal off When the connection to client is lost, server cleans up all the locks held by the client. After the connection is restored, the client reacquires (heals) the fcntl locks released by the server.
features.grace-timeout 10 Specifies the duration for the lock state to be maintained on the client after a network disconnection. Range 10-1800 seconds.
features.file-snapshot off enable/disable file-snapshot feature in the volume.
features.uss off enable/disable User Serviceable Snapshots on the volume.
features.snapshot-directory .snaps Entry point directory for entering snapshot world
features.show-snapshot-directory off show entry point in readdir output of snapdir-entry-path which is set by samba
features.quota-deem-statfs off If set to on, it takes quota limits into consideration while estimating fs size. (df command) (Default is off).
features.read-only off When “on”, makes a volume read-only. It is turned “off” by default.
features.worm off When “on”, makes a volume get write once read many feature. It is turned “off” by default.
features.barrier-timeout 120 After ‘timeout’ seconds since the time ‘barrier’ option was set to “on”, acknowledgements to file operations are no longer blocked and previously blocked acknowledgements are sent to the application
features.trash off Enable/disable trash translator
features.trash-dir .trashcan Directory for trash files
features.trash-eliminate-path (null) Eliminate paths to be excluded from trashing
features.trash-max-filesize 5MB Maximum size of file that can be moved to trash
features.trash-internal-op off Enable/disable trash translator for internal operations
features.ctr-enabled off Enable CTR xlator
features.record-counters off A Change Time Recorder xlator option to enable recording write and read heat counters. The default is disabled. If enabled, "cluster.write-freq-threshold" and "cluster.read-freq-threshold" define the number of writes (or reads) to a given file that are needed before triggering migration.
features.ctr-sql-db-cachesize 1000 Defines the cache size of the sqlite database of the changetimerecorder xlator. The input to this option is in pages. Each page is 4096 bytes. Default value is 1000 pages, i.e. ~4 MB. The max value is 262144 pages, i.e. 1 GB, and the min value is 1000 pages, i.e. ~4 MB.
features.ctr-sql-db-wal-autocheckpoint 1000 Defines the autocheckpoint of the sqlite database of changetimerecorder. The input to this option is in pages. Each page is 4096 bytes. Default value is 1000 pages, i.e. ~4 MB. The max value is 262144 pages, i.e. 1 GB, and the min value is 1000 pages, i.e. ~4 MB.
features.shard-block-size 4MB The size unit used to break a file into multiple chunks
features.cache-invalidation off When “on”, sends cache-invalidation notifications.
features.cache-invalidation-timeout 60 After ‘timeout’ seconds since the time client accessed any file, cache-invalidation notifications are no longer sent to that client.
client.event-threads 2 Specifies the number of event threads to execute in parallel. Larger values would help process responses faster, depending on available processing power. Range 1-32 threads.
auth.allow null Allow a comma separated list of addresses and/or hostnames to connect to the server. Option auth.reject overrides this option. By default, all connections are allowed.
auth.reject null Reject a comma separated list of addresses and/or hostnames to connect to the server. This option overrides the auth.allow option. By default, all connections are allowed.
server.root-squash off Map requests from uid/gid 0 to the anonymous uid/gid. Note that this does not apply to any other uids or gids that might be equally sensitive, such as user bin or group staff.
server.anonuid 65534 value of the uid used for the anonymous user/nfsnobody when root-squash is enabled.
server.anongid 65534 value of the gid used for the anonymous user/nfsnobody when root-squash is enabled.
server.statedump-path /var/run/gluster Specifies directory in which gluster should save its statedumps.
server.outstanding-rpc-limit 64 Parameter to throttle the number of incoming RPC requests from a client. 0 means no limit (can potentially run out of memory)
server.manage-gids off Resolve groups on the server-side.
server.dynamic-auth on When ‘on’ perform dynamic authentication of volume options in order to allow/terminate client transport connection immediately in response to *.allow | *.reject volume set options.
server.gid-timeout 300 Timeout in seconds for the cached groups to expire.
server.event-threads 2 Specifies the number of event threads to execute in parallel. Larger values would help process responses faster, depending on available processing power. Range 1-32 threads.
ssl.own-cert null SSL certificate. Ignored if SSL is not enabled.
ssl.private-key null SSL private key. Ignored if SSL is not enabled.
ssl.ca-list null SSL CA list. Ignored if SSL is not enabled.
ssl.crl-path null Path to directory containing CRL. Ignored if SSL is not enabled.
ssl.certificate-depth null Maximum certificate-chain depth. If zero, the peer’s certificate itself must be in the local certificate list. Otherwise, there may be up to N signing certificates between the peer’s and the local list. Ignored if SSL is not enabled.
ssl.cipher-list null Allowed SSL ciphers. Ignored if SSL is not enabled.
ssl.dh-param null DH parameters file. Ignored if SSL is not enabled.
ssl.ec-curve null ECDH curve name. Ignored if SSL is not enabled.
storage.linux-aio off Support for native Linux AIO
storage.batch-fsync-mode reverse-fsync Possible values: syncfs: Perform one syncfs() on behalf of a batch of fsyncs. syncfs-single-fsync: Perform one syncfs() on behalf of a batch of fsyncs and one fsync() per batch. syncfs-reverse-fsync: Perform one syncfs() on behalf of a batch of fsyncs and fsync() each file in the batch in reverse order. reverse-fsync: Perform fsync() of each file in the batch in reverse order.
storage.batch-fsync-delay-usec 0 Num of usecs to wait for aggregating fsync requests
storage.owner-uid -1 Support for setting uid of brick’s owner
storage.owner-gid -1 Support for setting gid of brick’s owner
storage.node-uuid-pathinfo off return glusterd’s node-uuid in pathinfo xattr string instead of hostname
storage.health-check-interval 30 Interval in seconds for a filesystem health check, set to 0 to disable
storage.build-pgfid off Enable placeholders for gfid to path conversion
storage.bd-aio off Support for native Linux AIO

A basic GlusterFS tuning example

Option Setting Description
cluster.server-quorum-type server
cluster.quorum-type auto
network.remote-dio enable
cluster.eager-lock enable
performance.stat-prefetch off
performance.io-cache off
performance.read-ahead off
performance.quick-read off
performance.readdir-ahead

How Redis does replication

  • step 1: register serverCron as the periodic timer callback
//location:server.c
//function: register serverCron as the timer callback
if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
serverPanic("Can't create event loop timers.");
exit(1);
}
  • step 2: serverCron calls replicationCron every 1000 milliseconds
//location:server.c
//function: serverCron, the periodic housekeeping job of redis-server
run_with_period(1000) replicationCron();
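For reference, the gate below is a simplified standalone sketch of how a run_with_period-style macro limits a block inside a cron loop to once per period (illustrative only, not the exact Redis macro; HZ and RUN_WITH_PERIOD are made-up names for this sketch):

/* serverCron-style loop: the cron body runs HZ times per second, and the gate
 * lets the guarded block run only once every period_ms milliseconds. */
#include <stdio.h>

#define HZ 10  /* cron iterations per second (assumption for this sketch) */
#define RUN_WITH_PERIOD(period_ms, loops) \
    if ((loops) % ((period_ms) / (1000 / HZ)) == 0)

int main(void) {
    for (int cronloops = 0; cronloops < 50; cronloops++) { /* 5 simulated seconds */
        RUN_WITH_PERIOD(1000, cronloops) {
            printf("replicationCron tick at loop %d\n", cronloops);
        }
    }
    return 0;
}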
  • step 3: the core replication cron job

    //location:replication.c
    //function: replicationCron, drives replication housekeeping
    /* Replication cron function, called 1 time per second. */
    void replicationCron(void) {


    /* Check if we should connect to a MASTER */
    if (server.repl_state == REPL_STATE_CONNECT) {
    if (connectWithMaster() == C_OK) {
    serverLog(LL_NOTICE,"MASTER <-> REPLICA sync started");
    }
    };

    }

    int connectWithMaster(void) {
    int fd;
    fd = anetTcpNonBlockBestEffortBindConnect(NULL,
    server.masterhost,server.masterport,NET_FIRST_BIND_ADDR);
    // fd of master socket
    aeCreateFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE,syncWithMaster,NULL);

    }

    void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {

    /* Send a PING to check the master is able to reply without errors. */
    sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PING",NULL);

    /* Receive the PONG command. */
    sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);

    /* AUTH with the master if required. */
    sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL);


    /* Receive AUTH reply. */
    sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);

    /* Set the slave port, so that Master's INFO command can list the
    * slave listening port correctly. */
    sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
    "listening-port",port, NULL);


    /* Receive REPLCONF listening-port reply. */
    sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);

    /* Set the slave ip, so that Master's INFO command can list the
    * slave IP address port correctly in case of port forwarding or NAT. */
    sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
    "ip-address",server.slave_announce_ip, NULL);

    /* Receive REPLCONF ip-address reply. */
    sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);

    /* Inform the master of our (slave) capabilities.
    *
    * EOF: supports EOF-style RDB transfer for diskless replication.
    * PSYNC2: supports PSYNC v2, so understands +CONTINUE <new repl ID>.
    *
    * The master will ignore capabilities it does not understand. */
    sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
    "capa","eof","capa","psync2",NULL);



    /* Receive CAPA reply. */
    sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);


    /* Try a partial resynchronization. If we don't have a cached master
    * slaveTryPartialResynchronization() will at least try to use PSYNC
    * to start a full resynchronization so that we get the master run id
    * and the global offset, to try a partial resync at the next
    * reconnection attempt. */
    slaveTryPartialResynchronization(fd,0) {
    /* If we reached this point, we are able to perform a partial resync. */

    /* Issue the PSYNC command */
    reply = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PSYNC",psync_replid,psync_offset,NULL)
    {
    /* the PSYNC command above ends up calling syncCommand on the master node */
    /* SYNC and PSYNC command implementation. */

    // PSYNC command will call syncCommand function
    void syncCommand(client *c) {
    masterTryPartialResynchronization(c) {
    // write master replid to slave

    if (c->slave_capa & SLAVE_CAPA_PSYNC2) {
    buflen = snprintf(buf,sizeof(buf),"+CONTINUE %s\r\n", server.replid);
    } else {
    buflen = snprintf(buf,sizeof(buf),"+CONTINUE\r\n");
    }
    if (write(c->fd,buf,buflen) != buflen) {
    freeClientAsync(c);
    }

    /* 1) Set client state to make it a slave.
    * 2) Inform the client we can continue with +CONTINUE
    * 3) Send the backlog data (from the offset to the end) to the slave. */
    c->flags |= CLIENT_SLAVE;
    c->replstate = SLAVE_STATE_ONLINE;
    c->repl_ack_time = server.unixtime;
    c->repl_put_online_on_ack = 0;
    listAddNodeTail(server.slaves,c);
    startBgsaveForReplication(c->slave_capa) {
    // proto
    //int startBgsaveForReplication(int mincapa)
    int retval;

    // socket_target is set when diskless sync is enabled and the slave supports EOF-style transfer
    int socket_target = server.repl_diskless_sync && (mincapa & SLAVE_CAPA_EOF);
    rdbSaveInfo rsi, *rsiptr;
    rsiptr = rdbPopulateSaveInfo(&rsi);

    if (rsiptr) {
    if (socket_target)
    // diskless path: stream the rdb payload, marked with an end flag, directly to the slave sockets
    rdbSaveToSlavesSockets(rsiptr){
    if(fork()==0){
    CHILD_INFO_TYPE_RDB
    }
    }
    else
    // 1. disk path: bgsave the current dataset to the rdb file in a background child
    rdbSaveBackground(server.rdb_filename,rsiptr){
    if((childpid=fork())==0){
    // the forked child saves all databases to the rdb file on disk, then exits
    rdbSave(rdb_filename);
    // write CHILD_INFO_TYPE_RDB to parent process by pipe
    sendChildInfo(CHILD_INFO_TYPE_RDB)
    }else{
    server.rdb_child_pid = childpid;
    server.rdb_child_type = RDB_CHILD_TYPE_DISK;
    }
    }
    }
    copyClientOutputBuffer(c,slave);
    if(!socket_target) {
    // 2.write replid and offset to slave
    replicationSetupSlaveForFullResync(c,slave->psync_initial_offset)
    }
    }
    }
    }
    reply = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
    /* We are going to full resync, discard the cached master structure. */
    replicationDiscardCachedMaster();

    }

    psync_result = slaveTryPartialResynchronization(fd,1);


    /* PSYNC failed or is not supported: we want our slaves to resync with us
    * as well, if we have any sub-slaves. The master may transfer us an
    * entirely different data set and we have no way to incrementally feed
    * our slaves after that. */
    disconnectSlaves(); /* Force our slaves to resync with us as well. */
    freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */

    /* Fall back to SYNC if needed. Otherwise psync_result == PSYNC_FULLRESYNC
    * and the server.master_replid and master_initial_offset are
    * already populated. */
    syncWrite(fd,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1);

    /* Prepare a suitable temp file for bulk transfer */
    while(maxtries--) {
    snprintf(tmpfile,256,
    "temp-%d.%ld.rdb",(int)server.unixtime,(long int)getpid());
    dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
    }


    /* Setup the non blocking download of the bulk file. */
    aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL);

    server.repl_state = REPL_STATE_TRANSFER;
    server.repl_transfer_size = -1;
    server.repl_transfer_read = 0;
    server.repl_transfer_last_fsync_off = 0;

    // this node (the slave) keeps the fd of the temp rdb file used for the bulk transfer
    server.repl_transfer_fd = dfd;
    server.repl_transfer_lastio = server.unixtime;
    server.repl_transfer_tmpfile = zstrdup(tmpfile);
    return;
    }

    void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
    char buf[4096];
    ssize_t nread, readlen, nwritten;

    /* If repl_transfer_size == -1 we still have to read the bulk length
    * from the master reply. */
    syncReadLine(fd,buf,1024,server.repl_syncio_timeout*1000) == -1) {


    /* Read bulk data */
    if (usemark) {
    readlen = sizeof(buf);
    } else {
    left = server.repl_transfer_size - server.repl_transfer_read;
    readlen = (left < (signed)sizeof(buf)) ? left : (signed)sizeof(buf);
    }

    // read rdb.dump from master
    nread = read(fd,buf,readlen);
    if (nread <= 0) {
    cancelReplicationHandshake();
    }

    // write to current temp-rdb.dump file
    nwritten = write(server.repl_transfer_fd,buf,nread))


    rename(server.repl_transfer_tmpfile,server.rdb_filename) ;

    /* We need to stop any AOFRW fork before flushing and parsing
    * RDB, otherwise we'll create a copy-on-write disaster. */
    if(aof_is_enabled) stopAppendOnly();

    // notify keyspace watchers that all databases are about to be flushed
    signalFlushedDb(-1);

    // drop all local data before loading the master's dataset
    emptyDb(-1,server.repl_slave_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS,replicationEmptyDbCallback)
    {
    // prototype: long long emptyDb(int dbnum, int flags, void(callback)(void*))
    int async = (flags & EMPTYDB_ASYNC);
    long long removed = 0;
    int startdb = 0;
    int enddb = server.dbnum-1;

    for (int j = startdb; j <= enddb; j++) {
    removed += dictSize(server.db[j].dict);
    // async means the flush is handled lazily by a background job
    if (async) {
    // a fresh dict is installed and the old one is queued for lazy freeing
    emptyDbAsync(&server.db[j]);
    } else {
    // blocking flush of all data in this database
    dictEmpty(server.db[j].dict,callback);
    dictEmpty(server.db[j].expires,callback);
    }
    }


    /* Before loading the DB into memory we need to delete the readable
    * handler, otherwise it will get called recursively since
    * rdbLoad() will call the event loop to process events from time to
    * time for non blocking loading. */
    aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);

    // initialize the rdb save info
    rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;

    //load data from rdb_file
    rdbLoad(server.rdb_filename,&rsi);


    // create the master client on the slave side; it wraps the master socket
    // and records the master's replication ID into server.replid
    replicationCreateMasterClient(server.repl_transfer_s,rsi.repl_stream_db);
    server.repl_state = REPL_STATE_CONNECTED;

    // clear the secondary replication ID kept in server.replid2
    clearReplicationId2();

    /* Let's create the replication backlog if needed. Slaves need to
    * accumulate the backlog regardless of the fact they have sub-slaves
    * or not, in order to behave correctly if they are promoted to
    * masters after a failover. */
    if (server.repl_backlog == NULL) createReplicationBacklog();


    /* Restart the AOF subsystem now that we finished the sync. This
    * will trigger an AOF rewrite, and when done will start appending
    * to the new file. */
    if (aof_is_enabled) restartAOF();
    }
    return;

    }

    /* On success the function returns the number of keys removed from the
    * database(s). Otherwise -1 is returned in the specific case the
    * DB number is out of range, and errno is set to EINVAL. */
    long long emptyDb(int dbnum, int flags, void(callback)(void*)) {
    int async = (flags & EMPTYDB_ASYNC);
    long long removed = 0;

    if (dbnum < -1 || dbnum >= server.dbnum) {
    errno = EINVAL;
    return -1;
    }

    int startdb, enddb;
    if (dbnum == -1) {
    startdb = 0;
    enddb = server.dbnum-1;
    } else {
    startdb = enddb = dbnum;
    }

    for (int j = startdb; j <= enddb; j++) {
    removed += dictSize(server.db[j].dict);
    if (async) {
    emptyDbAsync(&server.db[j]);
    } else {
    dictEmpty(server.db[j].dict,callback);
    dictEmpty(server.db[j].expires,callback);
    }
    }
    if (server.cluster_enabled) {
    if (async) {
    slotToKeyFlushAsync();
    } else {
    slotToKeyFlush();
    }
    }
    if (dbnum == -1) flushSlaveKeysWithExpireList();
    return removed;
    }
  • step 4: wait for the child process

//location: server.c
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData)
{
int statloc;
if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
int exitcode = WEXITSTATUS(statloc);
int bysignal = 0;

if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);

if (pid == -1) {
// error output
} else if (pid == server.rdb_child_pid) {
//on each bgsave, the server records the child pid
backgroundSaveDoneHandler(exitcode,bysignal)
{
switch(server.rdb_child_type) {
case RDB_CHILD_TYPE_DISK:
backgroundSaveDoneHandlerDisk(exitcode,bysignal)
{
updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_DISK){
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
//
} else if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) {
if (type == RDB_CHILD_TYPE_SOCKET) {
// update slave info
slave->replstate = SLAVE_STATE_ONLINE;
slave->repl_put_online_on_ack = 1;
slave->repl_ack_time = server.unixtime; /* Timeout otherwise. */
}else{
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
//sendBulkToSlave: send the master's rdb.dump to every slave
aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave);
}
}
break;
case RDB_CHILD_TYPE_SOCKET:
backgroundSaveDoneHandlerSocket(exitcode,bysignal)
{
server.rdb_child_type = RDB_CHILD_TYPE_NONE;
updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_SOCKET);
}
break;
}
}
//the parent reads the info the child wrote to the pipe and validates it
if (!bysignal && exitcode == 0) receiveChildInfo();
} else if (pid == server.aof_child_pid) {
backgroundRewriteDoneHandler(exitcode,bysignal);

//the parent reads the info the child wrote to the pipe and validates it
if (!bysignal && exitcode == 0) receiveChildInfo();
} else {
if (!ldbRemoveChild(pid)) {
serverLog(LL_WARNING,
"Warning, detected child with unmatched pid: %ld",
(long)pid);
}
}
}
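The child-reaping pattern above can be reduced to a small standalone sketch (plain POSIX, not Redis internals): the parent keeps doing its periodic work and uses wait3(WNOHANG) to reap the background child without blocking.

/* Parent polls for a finished background child, the way serverCron polls for a
 * finished bgsave child. */
#include <stdio.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void) {
    pid_t child = fork();
    if (child == 0) {       /* stand-in for the background rdb save */
        sleep(2);
        _exit(0);
    }

    for (;;) {              /* stand-in for the serverCron loop */
        int statloc;
        pid_t pid = wait3(&statloc, WNOHANG, NULL);
        if (pid == child) {
            int exitcode = WEXITSTATUS(statloc);
            int bysignal = WIFSIGNALED(statloc) ? WTERMSIG(statloc) : 0;
            printf("child %ld done: exitcode=%d bysignal=%d\n",
                   (long)pid, exitcode, bysignal);
            break;
        }
        /* pid == 0: child still running, keep doing the periodic work */
        usleep(100 * 1000);
    }
    return 0;
}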

dup

  • The prototype is int dup(int old_fd): it copies the entry at index old_fd into the lowest available slot of the current process's file descriptor table. The open system call likewise returns the lowest free index in the descriptor table as the new file descriptor. Note that redirecting by calling close() followed by dup() is not atomic.

    dup2

  • The prototype is int dup2(int oldfd, int newfd), and the operation is atomic: if newfd is already open it is closed first, and then the entry at index oldfd is copied into slot newfd, equivalent to close(newfd) followed by dup(oldfd) performed as one atomic step. If oldfd equals newfd, dup2 simply returns newfd.
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define FILE_NAME "/tmp/out"

static void redirect_stdout_without_dup() {
    fprintf(stdout, "pid=%d\n", getpid());
    const char *str = "my dup\n";
    // close stdout (fd 1)
    close(1);
    // the lowest free slot in this process's descriptor table is now 1,
    // because we just closed it
    int fd = open(FILE_NAME, O_RDWR | O_CREAT | O_TRUNC, 0666);
    if (fd > 0) {
        // stdout occupies index 1 in every process's descriptor table;
        // open() returned 1, so this fprintf actually writes into the file we just opened
        fprintf(stdout, " open fd=%d\n", fd);
        // this write also goes to fd 1, which no longer refers to the terminal's stdout
        write(fd, str, strlen(str));
        close(1);
    }
}

static void redirect_stdout_with_dup() {
    fprintf(stdout, "pid=%d\n", getpid());
    const char *str = "my dup";
    // open a new fd; it is not one of {0,1,2}, which are still in use
    int fd = open(FILE_NAME, O_RDWR | O_CREAT | O_TRUNC, 0666);
    if (fd > 0) {
        // close the standard output descriptor
        close(1);
        // duplicate fd into the lowest free slot, i.e. the 1 we just closed
        dup(fd);
        // the fprintf output now lands in the file, not on the terminal
        fprintf(stdout, " open fd=%d\n", fd);
        write(fd, str, strlen(str));
        // close the original descriptor; fd 1 still refers to the file
        close(fd);
    }
}

static void redirect_stdout_with_dup2() {
    fprintf(stdout, "pid=%d\n", getpid());
    const char *str = "i'm dup2\n";
    // open a new file descriptor
    int fd = open(FILE_NAME, O_RDWR | O_CREAT | O_TRUNC, 0666);
    if (fd > 0) {
        // if fd 1 is open, dup2 closes it first, then copies the entry at
        // index fd into slot 1; if fd == 1 it simply returns fd
        dup2(fd, 1); // equals: close(1) and dup(fd)
        // fd and fd 1 now refer to the same open file description
        fprintf(stdout, "%d already redirect to stdout\n", fd);
        write(fd, str, strlen(str));
        // close the duplicate descriptor, keeping fd 1 open
        if (fd != 1) {
            close(fd);
        }
    }
}

int main(void) {
    /*
    redirect_stdout_without_dup();
    redirect_stdout_with_dup();
    */
    redirect_stdout_with_dup2();
    for (;;) {
        sleep(1);
    }
    return 0;
}

Conclusion

  • As the lsof output shows, standard output (fd 1) has been redirected to /tmp/out. lsof can be used to see which file descriptors are open in the current process's descriptor table; the sketch below does the same by reading /proc/self/fd.
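For completeness, here is a small sketch (Linux-specific, relying on /proc; not part of the original post) that prints the same per-process view by walking /proc/self/fd and resolving each symlink:

/* List the open file descriptors of the current process, similar to what
 * lsof shows for this pid. */
#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void) {
    DIR *dir = opendir("/proc/self/fd");
    if (dir == NULL) {
        perror("opendir");
        return 1;
    }
    struct dirent *ent;
    while ((ent = readdir(dir)) != NULL) {
        if (ent->d_name[0] == '.')
            continue;                       /* skip "." and ".." */
        char path[PATH_MAX], target[PATH_MAX];
        snprintf(path, sizeof(path), "/proc/self/fd/%s", ent->d_name);
        ssize_t n = readlink(path, target, sizeof(target) - 1);
        if (n < 0)
            continue;
        target[n] = '\0';
        printf("fd %s -> %s\n", ent->d_name, target);
    }
    closedir(dir);
    return 0;
}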

The fork system call

Description

  • fork() creates a new process by duplicating the calling process. The new process, referred to as the child, is an exact duplicate of the calling process

Code example

    /*************************************************************************
    > File Name: fork0.c
    > Author:perrynzhou
    > Mail:perrynzhou@gmail.com
    > Created Time: Tue 11 Jun 2019 10:27:20 AM CST
    ************************************************************************/

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>
    #include <ctype.h>
    #include <unistd.h>
    #include <getopt.h>
    void usage(char *s)
    {
    printf("%s -f {0 | 1}\n", s);
    printf(" 1 -- force flush all opened stream before fork\n");
    printf(" 0 -- not flush all opened stream before fork\n");
    }
    bool is_number(const char *s)
    {
    if (s == NULL)
    {
    return false;
    }
    while (*s != '\0')
    {
    if (isdigit(*s++) == 0)
    {
    return false;
    }
    }
    return true;
    }
    int main(int argc, char *argv[])
    {
    if (argc != 3)
    {
    usage(argv[0]);
    exit(0);
    }
    const char *cmd_parse_fmt = "f:";
    int ch;
    int force_flush_flag = 0;
    while ((ch = getopt(argc, argv, cmd_parse_fmt)) != -1)
    {
    switch (ch)
    {
    case 'f':
    if (!is_number(optarg))
    {
    force_flush_flag = -1;
    }
    else
    {
    force_flush_flag = atoi(optarg);
    }
    break;
    default:
    break;
    }
    }
    if (force_flush_flag > 1 || force_flush_flag < 0)
    {
    usage(argv[0]);
    exit(0);
    }
    printf("flush opened stream :%d\n", force_flush_flag);

    printf("[%ld]begin\n", getpid());
    pid_t pid;
    if (force_flush_flag == 1)
    {
    fflush(NULL);
    }
    if ((pid = fork()) < 0)
    {
    perror("fork");
    exit(0);
    }
    else if (pid == 0)
    {
    printf("[%ld] child process working\n", getpid());
    exit(0);
    }
    else
    {
    printf("[%ld] parent process working\n", getpid());
    }
    return 0;
    }

Standard output

  • On standard output, each of the highlighted lines is printed only once.

File output

  • When the output is redirected to a file, the results for f=1 and f=0 are completely different. Why?

Standard output vs. file output

  • Standard output to a terminal is line-buffered (a \n forces a flush), so each printf issued before fork() is flushed once, before the fork happens.
  • Output redirected to a file is fully buffered (data is written out only when the buffer reaches the configured size).
  • When test_fork -f 0 is run with its output redirected to a file, the parent forks and the child shares the parent's address space copy-on-write; since nothing is modified, both processes hold an identical copy of the still-unflushed stdio buffer. Each process flushes its own copy on exit, so the file contains "flush opened stream :0" and "[92741]begin" twice.
  • With f=1 and the output redirected to a file, "flush opened stream :1" and "[92741]begin" appear only once, because fflush(NULL) is called in the parent before fork(); that call forces all open streams to be flushed, so the child inherits an empty buffer (the sketch after this list reproduces the same effect with setvbuf()).
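The same effect can be reproduced without redirecting to a file. The minimal sketch below (not part of the original post) uses setvbuf() to force full buffering on stdout, so the pre-fork printf is still sitting in the stdio buffer when fork() duplicates the process; uncommenting the fflush(NULL) line makes the duplicate disappear.

/* "before fork" is written twice, once when each process flushes its copy of
 * the inherited stdio buffer at exit. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void) {
    static char buf[BUFSIZ];
    /* force full buffering, as if stdout were redirected to a file */
    setvbuf(stdout, buf, _IOFBF, sizeof(buf));

    printf("before fork\n");   /* stays in the buffer: nothing is flushed yet */
    /* fflush(NULL); */        /* uncomment: "before fork" then appears only once */

    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        exit(1);
    }
    printf("%s exiting\n", pid == 0 ? "child" : "parent");
    if (pid > 0)
        waitpid(pid, NULL, 0);
    return 0;
}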

0. Open questions
  • After glusterd starts, how does it bring up the glusterfsd brick processes?
  • Where do glusterfsd's startup parameters come from?
  • How is glusterfsd loaded?
  • What roles do glusterfsd and glusterd each play after startup? (to be discussed later)
    1. The glusterfs entry function
  • The main function of glusterfsd loads the xlator {name = 0x687dd0 "management", type = 0x687f50 "mgmt/glusterd"}, then forks a child process to run the init method of mgmt/glusterd.
    2. The glusterd startup process
  • In the child process, init in glusterd.c (the source file behind mgmt/glusterd) initializes glusterd and calls the following methods in order (a sketch of the fork+execvp pattern follows the runner_start snippet below):
    • glusterfs_volumes_init
    • glusterd.c:init
    • glusterd_spawn_daemons
    • glusterd_restart_bricks
    • glusterd_brick_start
    • glusterd_volume_start_glusterfs
    • runner_run
    • runner_start: forks a child process and calls execvp to load the /usr/local/sbin/glusterfsd binary
       int runner_start(runner_t *runner)
      {
      //----- omitted ----------------
      execvp(runner->argv[0], runner->argv);
      //----- omitted ----------------
      }
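For illustration, here is a minimal standalone sketch of the fork()+execvp() pattern that runner_start relies on (not GlusterFS code; echo stands in for /usr/local/sbin/glusterfsd, and the waitpid call only exists so the sketch collects its child, whereas glusterd itself does not block like this):

/* Parent forks; the child replaces itself with the target binary via execvp(). */
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void) {
    char *argv[] = {"echo", "pretend this is glusterfsd starting up", NULL};

    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        execvp(argv[0], argv);   /* only returns on failure */
        perror("execvp");
        _exit(127);
    }
    int status;
    waitpid(pid, &status, 0);
    printf("child exited with status %d\n", WEXITSTATUS(status));
    return 0;
}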
    3. How glusterfsd loads xlators
  • glusterfsd loads the xlators it needs at runtime by reading the config file /var/lib/glusterd/vols/dht_vol/dht_vol.172.17.0.2.data-brick.vol; this configuration is generated by the system when the user creates the volume.
4. The process list after glusterd starts
  • After runner_start, execution continues in the main function of glusterfsd.c and the corresponding services are started; glusterfsd is now formally running, which is what you see after systemctl start glusterd:
5. The glusterfsd startup flow
  • The previous step loaded the glusterfsd binary via runner_start; glusterfsd then enters its initialization stage, where glusterfs_listener_init first sets up the tcp listener.
  • glusterfsd then connects to glusterd over tcp to fetch the volume xlator information, in preparation for building the xlator graph.

The volume xlator information is as follows:

(gdb) p rsp.spec
$6 = 0x7fffe4002fc0 "volume dht_vol-posix\n type storage/posix\n option shared-brick-count 1\n option volume-id 25305c2d-317f-45e9-8e60-be322ed69ee1\n option directory /data/brick\nend-volume\n\nvolume dht_vol-trash\n type features/trash\n option trash-internal-op off\n option brick-path /data/brick\n option trash-dir .trashcan\n subvolumes dht_vol-posix\nend-volume\n\nvolume dht_vol-changelog\n type features/changelog\n option changelog-barrier-timeout 120\n option changelog-dir /data/brick/.glusterfs/changelogs\n option changelog-brick /data/brick\n subvolumes dht_vol-trash\nend-volume\n\nvolume dht_vol-bitrot-stub\n type features/bitrot-stub\n option bitrot disable\n option export /data/brick\n subvolumes dht_vol-changelog\nend-volume\n\nvolume dht_vol-access-control\n type features/access-control\n subvolumes dht_vol-bitrot-stub\nend-volume\n\nvolume dht_vol-locks\n type features/locks\n option enforce-mandatory-lock off\n subvolumes dht_vol-access-control\nend-volume\n\nvolume dht_vol-worm\n type features/worm\n option worm-files-deletable on\n option worm-file-level off\n option worm off\n subvolumes dht_vol-locks\nend-volume\n\nvolume dht_vol-read-only\n type features/read-only\n option read-only off\n subvolumes dht_vol-worm\nend-volume\n\nvolume dht_vol-leases\n type features/leases\n option leases off\n subvolumes dht_vol-read-only\nend-volume\n\nvolume dht_vol-upcall\n type features/upcall\n option cache-invalidation off\n subvolumes dht_vol-leases\nend-volume\n\nvolume dht_vol-io-threads\n type performance/io-threads\n subvolumes dht_vol-upcall\nend-volume\n\nvolume dht_vol-selinux\n type features/selinux\n option selinux on\n subvolumes dht_vol-io-threads\nend-volume\n\nvolume dht_vol-marker\n type features/marker\n option inode-quota off\n option quota off\n option gsync-force-xtime off\n option xtime off\n option quota-version 0\n option timestamp-file /var/lib/glusterd/vols/dht_vol/marker.tstamp\n option volume-uuid 25305c2d-317f-45e9-8e60-be322ed69ee1\n subvolumes dht_vol-selinux\nend-volume\n\nvolume dht_vol-barrier\n type features/barrier\n option barrier-timeout 120\n option barrier disable\n subvolumes dht_vol-marker\nend-volume\n\nvolume dht_vol-index\n type features/index\n option xattrop-pending-watchlist trusted.afr.dht_vol-\n option xattrop-dirty-watchlist trusted.afr.dirty\n option index-base /data/brick/.glusterfs/indices\n subvolumes dht_vol-barrier\nend-volume\n\nvolume dht_vol-quota\n type features/quota\n option deem-statfs off\n option server-quota off\n option volume-uuid dht_vol\n subvolumes dht_vol-index\nend-volume\n\nvolume dht_vol-io-stats\n type debug/io-stats\n option count-fop-hits off\n option latency-measurement off\n option log-level INFO\n option unique-id /data/brick\n subvolumes dht_vol-quota\nend-volume\n\nvolume /data/brick\n type performance/decompounder\n subvolumes dht_vol-io-stats\nend-volume\n\nvolume dht_vol-server\n type protocol/server\n option transport.listen-backlog 1024\n option transport.socket.keepalive-count 9\n option transport.socket.keepalive-interval 2\n option transport.socket.keepalive-time 20\n option transport.socket.ssl-enabled off\n option transport.socket.keepalive 1\n option auth.addr./data/brick.allow *\n option auth-path /data/brick\n option auth.login.b26ac1fd-40a4-4c6e-b211-a9f2b58beedc.password 4eda9afe-9466-4283-b175-6020d5981994\n option auth.login./data/brick.allow b26ac1fd-40a4-4c6e-b211-a9f2b58beedc\n option transport.address-family inet\n option transport-type tcp\n subvolumes /data/brick\nend-volume\n"
  • Build the graph from the xlator information obtained from the volfile and call each xlator's init method; at this point glusterfsd has initialized successfully
    6. gdb debugging method
  • gdb breakpoint setup
    set print pretty on
    //debug the child process; the child loads and executes mgmt/glusterd/src/glusterd.c:init
    set follow-fork-mode child
    set detach-on-fork off
    br glusterfsd.c:main
    br glusterfs_volumes_init //both glusterd and glusterfsd call this function
    br glusterd.c:init
    //------ after hitting init, run the following ------
    /* after entering init, switch back to debugging the parent process
    set follow-fork-mode parent
    set detach-on-fork on
    */
    br glusterd_spawn_daemons
    br glusterd_restart_bricks
    br glusterd_brick_start
    br glusterd_volume_start_glusterfs
    /* after entering this function, switch to debugging the child process
    set follow-fork-mode child
    set detach-on-fork on
    */
    br runner_run
    br runner_log

    br runner_start

    /** after entering runner_start, set the following breakpoints; once glusterfsd starts it proceeds along this call chain */
    br glusterfs_volumes_init
    br glusterfs_listener_init //glusterd does not call this function
    br glusterfs_mgmt_init //glusterd does not call this function
    br mgmt_rpc_notify
    br glusterfs_volfile_fetch
    br glusterfs_volfile_fetch_one
    br mgmt_getspec_cbk
    br glusterfs_process_volfp
    br glusterfs_graph_activate
    br glusterfs_graph_init
  • glusterd gdb log
    [root@b94a78ebfc80 ~]$ gdb glusterd
    GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-115.el7
    Copyright (C) 2013 Free Software Foundation, Inc.
    License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
    This is free software: you are free to change and redistribute it.
    There is NO WARRANTY, to the extent permitted by law. Type "show copying"
    and "show warranty" for details.
    This GDB was configured as "x86_64-redhat-linux-gnu".
    For bug reporting instructions, please see:
    <http://www.gnu.org/software/gdb/bugs/>...
    Reading symbols from /usr/local/sbin/glusterfsd...done.
    (gdb) br glusterfsd.c:main
    Breakpoint 1 at 0x40b71d: file glusterfsd.c, line 2746.
    (gdb) br glusterd.c:init
    No source file named glusterd.c.
    Make breakpoint pending on future shared library load? (y or [n]) y
    Breakpoint 2 (glusterd.c:init) pending.
    (gdb) set print pretty on
    (gdb) set follow-fork-mode child
    (gdb) set detach-on-fork off
    (gdb) c
    The program is not being run.
    (gdb) r
    Starting program: /usr/local/sbin/glusterd
    [Thread debugging using libthread_db enabled]
    Using host libthread_db library "/lib64/libthread_db.so.1".

    Breakpoint 1, main (argc=1, argv=0x7fffffffe778) at glusterfsd.c:2746
    2746 glusterfs_ctx_t *ctx = NULL;
    Missing separate debuginfos, use: debuginfo-install glibc-2.17-292.el7.x86_64 libuuid-2.23.2-61.el7_7.1.x86_64 openssl-libs-1.0.2k-19.el7.x86_64 zlib-1.2.7-18.el7.x86_64
    (gdb) c
    Continuing.
    [New process 956]
    [Thread debugging using libthread_db enabled]
    Using host libthread_db library "/lib64/libthread_db.so.1".
    [New Thread 0x7ffff5431700 (LWP 957)]
    [New Thread 0x7ffff4c30700 (LWP 958)]
    [New Thread 0x7ffff442f700 (LWP 959)]
    [New Thread 0x7ffff3c2e700 (LWP 960)]
    [New Thread 0x7ffff342d700 (LWP 961)]
    [New Thread 0x7ffff2c2c700 (LWP 962)]
    Reading symbols from /usr/local/lib/glusterfs//xlator/mgmt/glusterd.so...done.
    [Switching to Thread 0x7ffff7fea4c0 (LWP 956)]

    Breakpoint 2, init (this=0x686d40) at glusterd.c:1373
    1373 int32_t ret = -1;
    Missing separate debuginfos, use: debuginfo-install glibc-2.17-292.el7.x86_64 libuuid-2.23.2-61.el7_7.1.x86_64 libxml2-2.9.1-6.el7_2.3.x86_64 openssl-libs-1.0.2k-19.el7.x86_64 userspace-rcu-0.7.16-1.el7.x86_64 xz-libs-5.2.2-1.el7.x86_64 zlib-1.2.7-18.el7.x86_64
    (gdb) set follow-fork-mode parent
    (gdb) set detach-on-fork on
    (gdb) br glusterd_spawn_daemons
    Breakpoint 3 at 0x7ffff1f79f85: file glusterd-utils.c, line 3619.
    (gdb) br glusterd_restart_bricks
    Breakpoint 4 at 0x7ffff1f832e3: file glusterd-utils.c, line 6334.
    (gdb) br glusterd_brick_start
    Breakpoint 5 at 0x7ffff1f826cd: file glusterd-utils.c, line 6107.
    (gdb) br glusterd_volume_start_glusterfs
    Breakpoint 6 at 0x7ffff1f73f5f: file glusterd-utils.c, line 2024.
    (gdb) c
    Continuing.
    Reading symbols from /usr/local/lib/glusterfs//rpc-transport/socket.so...done.
    Reading symbols from /usr/local/lib/glusterfs//rpc-transport/rdma.so...done.
    Detaching after fork from child process 964.
    Detaching after fork from child process 965.
    Detaching after fork from child process 966.
    Detaching after fork from child process 967.
    Detaching after fork from child process 968.
    Detaching after fork from child process 969.
    Detaching after fork from child process 970.
    Detaching after fork from child process 971.
    Detaching after fork from child process 972.
    Detaching after fork from child process 973.
    Detaching after fork from child process 974.
    Detaching after fork from child process 975.
    Detaching after fork from child process 976.
    Detaching after fork from child process 977.
    Detaching after fork from child process 978.
    Detaching after fork from child process 979.
    Detaching after fork from child process 980.
    Detaching after fork from child process 981.
    Detaching after fork from child process 982.
    Detaching after fork from child process 983.
    Detaching after fork from child process 984.
    Detaching after fork from child process 985.
    Detaching after fork from child process 986.
    Detaching after fork from child process 987.
    Detaching after fork from child process 988.
    [Switching to Thread 0x7ffff342d700 (LWP 961)]

    Breakpoint 3, glusterd_spawn_daemons (opaque=0x0) at glusterd-utils.c:3619
    3619 glusterd_conf_t *conf = THIS->private;
    Missing separate debuginfos, use: debuginfo-install keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-37.el7_7.2.x86_64 libcom_err-1.42.9-16.el7.x86_64 libibverbs-22.1-3.el7.x86_64 libnl3-3.2.28-4.el7.x86_64 librdmacm-22.1-3.el7.x86_64 libselinux-2.5-14.1.el7.x86_64 pcre-8.32-17.el7.x86_64
    (gdb) n
    3620 int ret = -1;
    (gdb)
    3622 synclock_lock(&conf->big_lock);
    (gdb)
    [New Thread 0x7fffef606700 (LWP 989)]
    3623 glusterd_restart_bricks();
    (gdb)

    Breakpoint 4, glusterd_restart_bricks (opaque=0x7fffef9a9fc8) at glusterd-utils.c:6334
    6334 int ret = 0;
    (gdb)
    6335 glusterd_volinfo_t *volinfo = NULL;
    (gdb)
    6336 glusterd_brickinfo_t *brickinfo = NULL;
    (gdb) set follow-fork-mode child
    (gdb) set detach-on-fork on
    (gdb) br runner_run
    Breakpoint 7 at 0x7ffff7b35f04: runner_run. (2 locations)
    (gdb) br runner_start
    Breakpoint 8 at 0x7ffff7b35825: runner_start. (2 locations)
    (gdb) info break
    Num Type Disp Enb Address What
    1 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    1.1 y 0x000000000040b71d in main at glusterfsd.c:2746 inf 1
    1.2 y 0x000000000040b71d in main at glusterfsd.c:2746 inf 2
    2 breakpoint keep y 0x00007ffff1f3a7ee in init at glusterd.c:1373 inf 2
    breakpoint already hit 1 time
    3 breakpoint keep y 0x00007ffff1f79f85 in glusterd_spawn_daemons at glusterd-utils.c:3619 inf 2
    breakpoint already hit 1 time
    4 breakpoint keep y 0x00007ffff1f832e3 in glusterd_restart_bricks at glusterd-utils.c:6334 inf 2
    breakpoint already hit 1 time
    5 breakpoint keep y 0x00007ffff1f826cd in glusterd_brick_start at glusterd-utils.c:6107 inf 2
    6 breakpoint keep y 0x00007ffff1f73f5f in glusterd_volume_start_glusterfs at glusterd-utils.c:2024 inf 2
    7 breakpoint keep y <MULTIPLE>
    7.1 y 0x00007ffff7b35f04 in runner_run at run.c:430 inf 1
    7.2 y 0x00007ffff7b35f04 in runner_run at run.c:430 inf 2
    8 breakpoint keep y <MULTIPLE>
    8.1 y 0x00007ffff7b35825 in runner_start at run.c:259 inf 1
    8.2 y 0x00007ffff7b35825 in runner_start at run.c:259 inf 2
    (gdb) c
    Continuing.

    Breakpoint 5, glusterd_brick_start (volinfo=0x6db4b0, brickinfo=0x6ef490, wait=false, only_connect=false) at glusterd-utils.c:6107
    6107 int ret = -1;
    (gdb) c
    Continuing.
    [New Thread 0x7fffeee05700 (LWP 991)]

    Breakpoint 6, glusterd_volume_start_glusterfs (volinfo=0x6db4b0, brickinfo=0x6ef490, wait=false) at glusterd-utils.c:2024
    2024 int32_t ret = -1;
    (gdb) c
    Continuing.
    [New process 992]
    [Thread debugging using libthread_db enabled]
    Using host libthread_db library "/lib64/libthread_db.so.1".
    [Switching to Thread 0x7ffff342d700 (LWP 992)]

    Breakpoint 8, runner_start (runner=0x7fffef800e00) at run.c:259
    259 int pi[3][2] = {{-1, -1}, {-1, -1}, {-1, -1}};
    (gdb) n
    261 int ret = 0;
    (gdb)
    262 int errno_priv = 0;
    (gdb)
    263 int i = 0;
    (gdb)
    266 if (runner->runerr || !runner->argv) {
    (gdb)
    271 GF_ASSERT(runner->argv[0]);
    (gdb) set follow-fork-mode child
    (gdb) set detach-on-fork on
    (gdb) n
    276 ret = pipe(xpi);
    (gdb)
    277 if (ret != -1)
    (gdb)
    278 ret = fcntl(xpi[1], F_SETFD, FD_CLOEXEC);
    (gdb)
    280 for (i = 0; i < 3; i++) {
    (gdb)
    281 if (runner->chfd[i] != -2)
    (gdb)
    282 continue;
    (gdb)
    280 for (i = 0; i < 3; i++) {
    (gdb)
    281 if (runner->chfd[i] != -2)
    (gdb)
    282 continue;
    (gdb)
    280 for (i = 0; i < 3; i++) {
    (gdb)
    281 if (runner->chfd[i] != -2)
    (gdb)
    282 continue;
    (gdb)
    280 for (i = 0; i < 3; i++) {
    (gdb)
    291 if (ret != -1)
    (gdb)
    292 runner->chpid = fork();
    (gdb)
    [New process 995]
    [Thread debugging using libthread_db enabled]
    Using host libthread_db library "/lib64/libthread_db.so.1".
    [Switching to Thread 0x7ffff342d700 (LWP 995)]
    293 switch (runner->chpid) {
    327 if (ret != -1) {
    (gdb)
    328 int fdv[4] = {0, 1, 2, xpi[1]};
    (gdb)
    330 ret = close_fds_except(fdv, sizeof(fdv) / sizeof(*fdv));
    (gdb)
    333 if (ret != -1) {
    (gdb)
    335 sigemptyset(&set);
    (gdb)
    336 sigprocmask(SIG_SETMASK, &set, NULL);
    (gdb)
    338 execvp(runner->argv[0], runner->argv);
    (gdb) p runner->argv[0]
    $1 = 0x7fffe8000c60 "/usr/local/sbin/glusterfsd"
    (gdb) p runner->argv
    $2 = (char **) 0x7fffe8000970
    (gdb) p/s runner->argv
    $3 = (char **) 0x7fffe8000970
    (gdb) p/s runner->argv[1]
    $4 = 0x7fffe8000ce0 "-s"
    (gdb) p/s runner->argv[2]
    $5 = 0x7fffe8000d50 "172.17.0.2"
    (gdb) p/s runner->argv[3]
    $6 = 0x7fffe8000dc0 "--volfile-id"
    (gdb) p/s runner->argv[4]
    $7 = 0x7fffe8000e30 "dht_vol.172.17.0.2.data-brick"
    (gdb) p/s runner->argv[5]
    $8 = 0x7fffe8000eb0 "-p"
    (gdb) p/s runner->argv[6]
    $9 = 0x7fffe8000f20 "/var/run/gluster/vols/dht_vol/172.17.0.2-data-brick.pid"
    (gdb) p/s runner->argv[7]
    $10 = 0x7fffe8000fc0 "-S"
    (gdb) p/s runner->argv[8]
    $11 = 0x7fffe8001030 "/var/run/gluster/cb954c564295c3be.socket"
    (gdb) p/s runner->argv[9]
    $12 = 0x7fffe80010c0 "--brick-name"
    (gdb) p/s runner->argv[10]
    $13 = 0x7fffe8001130 "/data/brick"
    (gdb) p/s runner->argv[11]
    $14 = 0x7fffe80011a0 "-l"
    (gdb) p/s runner->argv[12]
    $15 = 0x7fffe8001210 "/var/log/glusterfs/bricks/data-brick.log"
    (gdb) p/s runner->argv[13]
    $16 = 0x7fffe80012a0 "--xlator-option"
    (gdb) p/s runner->argv[14]
    $17 = 0x7fffe8001310 "*-posix.glusterd-uuid=88efffbe-ce9e-4117-8d1a-77de0a8ecd75"
    (gdb) p/s runner->argv[15]
    $18 = 0x7fffe80013b0 "--process-name"
    (gdb) p/s runner->argv[16]
    $19 = 0x7fffe8001420 "brick"
    (gdb) p/s runner->argv[17]
    $20 = 0x7fffe8001490 "--brick-port"
    (gdb) p/s runner->argv[18]
    $21 = 0x7fffe8001500 "49152"
    (gdb) p/s runner->argv[19]
    $22 = 0x7fffe8001570 "--xlator-option"
    (gdb) p/s runner->argv[20]
    $23 = 0x7fffe80015e0 "dht_vol-server.listen-port=49152"
    (gdb) p/s runner->argv[21]
    $24 = 0x0
    (gdb)
  • glusterfsd gdb log
    (gdb) info break
    Num Type Disp Enb Address What
    1 breakpoint keep y <MULTIPLE>
    breakpoint already hit 2 times
    1.1 y 0x000000000040b71d in main at glusterfsd.c:2746 inf 1
    1.2 y 0x000000000040b71d in main at glusterfsd.c:2746 inf 5, 4, 3
    2 breakpoint keep y <PENDING> glusterd.c:init inf 5, 4, 3
    breakpoint already hit 1 time
    3 breakpoint keep y <PENDING> glusterd_spawn_daemons inf 5, 4, 3
    breakpoint already hit 1 time
    4 breakpoint keep y <PENDING> glusterd_restart_bricks inf 5, 4, 3
    breakpoint already hit 1 time
    5 breakpoint keep y <PENDING> glusterd_brick_start inf 5, 4, 3
    breakpoint already hit 1 time
    6 breakpoint keep y <PENDING> glusterd_volume_start_glusterfs inf 5, 4, 3
    breakpoint already hit 1 time
    7 breakpoint keep y <MULTIPLE>
    7.1 y 0x00007ffff7b35f04 in runner_run at run.c:430 inf 1
    7.2 y 0x00007ffff7b35f04 in runner_run at run.c:430 inf 5, 4, 3
    8 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    8.1 y 0x00007ffff7b35825 in runner_start at run.c:259 inf 1
    8.2 y 0x00007ffff7b35825 in runner_start at run.c:259 inf 5, 4, 3
    9 breakpoint keep y <PENDING> server-rpc-fops_v2.c:server4_0_mkdir
    10 breakpoint keep y <PENDING> server-resolve.c:server_resolve
    11 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    11.1 y 0x000000000040b603 in glusterfs_volumes_init at glusterfsd.c:2704 inf 1
    11.2 y 0x000000000040b603 in glusterfs_volumes_init at glusterfsd.c:2704 inf 5, 4, 3
    12 breakpoint keep y <MULTIPLE>
    12.1 y 0x000000000040b390 in glusterfs_process_volfp at glusterfsd.c:2626 inf 1
    12.2 y 0x000000000040b390 in glusterfs_process_volfp at glusterfsd.c:2626 inf 5, 4, 3
    13 breakpoint keep y <MULTIPLE>
    13.1 y 0x000000000040710f in get_volfp at glusterfsd.c:781 inf 1
    13.2 y 0x000000000040710f in get_volfp at glusterfsd.c:781 inf 5, 4, 3
    14 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    14.1 y 0x0000000000412581 in glusterfs_mgmt_init at glusterfsd-mgmt.c:2648 inf 1
    14.2 y 0x0000000000412581 in glusterfs_mgmt_init at glusterfsd-mgmt.c:2648 inf 5, 4, 3
    15 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    ---Type <return> to continue, or q <return> to quit---
    15.1 y 0x0000000000411848 in glusterfs_volfile_fetch at glusterfsd-mgmt.c:2270 inf 1
    15.2 y 0x0000000000411848 in glusterfs_volfile_fetch at glusterfsd-mgmt.c:2270 inf 5, 4, 3
    16 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    16.1 y 0x00000000004114ed in glusterfs_volfile_fetch_one at glusterfsd-mgmt.c:2166 inf 1
    16.2 y 0x00000000004114ed in glusterfs_volfile_fetch_one at glusterfsd-mgmt.c:2166 inf 5, 4, 3
    17 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    17.1 y 0x0000000000410a85 in mgmt_getspec_cbk at glusterfsd-mgmt.c:1922 inf 1
    17.2 y 0x0000000000410a85 in mgmt_getspec_cbk at glusterfsd-mgmt.c:1922 inf 5, 4, 3
    18 breakpoint keep y <MULTIPLE>
    18.1 y 0x000000000040b390 in glusterfs_process_volfp at glusterfsd.c:2626 inf 1
    18.2 y 0x000000000040b390 in glusterfs_process_volfp at glusterfsd.c:2626 inf 5, 4, 3
    (gdb)

Symptom

[2020-04-15 08:37:04.727619] I [rpc-clnt.c:1963:rpc_clnt_reconfig] 0-hot2_vol-client-2: changing port to 49153 (from 0)
[2020-04-15 08:37:04.727653] I [socket.c:864:__socket_shutdown] 0-hot2_vol-client-2: intentional socket shutdown(12)
[2020-04-15 08:37:04.730524] I [MSGID: 114057] [client-handshake.c:1376:select_server_supported_programs] 0-hot2_vol-client-1: Using Program GlusterFS 4.x v1, Num (1298437), Version (400)
[2020-04-15 08:37:04.730986] I [MSGID: 114046] [client-handshake.c:1106:client_setvolume_cbk] 0-hot2_vol-client-1: Connected to hot2_vol-client-1, attached to remote volume '/glusterfs/hotvol2/brick1'.
[2020-04-15 08:37:04.733549] I [MSGID: 114057] [client-handshake.c:1376:select_server_supported_programs] 0-hot2_vol-client-2: Using Program GlusterFS 4.x v1, Num (1298437), Version (400)
[2020-04-15 08:37:04.733979] I [MSGID: 114046] [client-handshake.c:1106:client_setvolume_cbk] 0-hot2_vol-client-2: Connected to hot2_vol-client-2, attached to remote volume '/glusterfs/hotvol2/brick1'.
[2020-04-15 08:37:04.735104] I [fuse-bridge.c:5166:fuse_init] 0-glusterfs-fuse: FUSE inited with protocol versions: glusterfs 7.24 kernel 7.22
[2020-04-15 08:37:04.735112] I [fuse-bridge.c:5777:fuse_graph_sync] 0-fuse: switched to graph 0
pending frames:
frame : type(1) op(LOOKUP)
frame : type(1) op(OPEN)
patchset: git://git.gluster.org/glusterfs.git
signal received: 11
time of crash:
2020-04-15 08:50:04
configuration details:
argp 1
backtrace 1
dlfcn 1
libpthread 1
llistxattr 1
setfsid 1
spinlock 1
epoll.h 1
xattr.h 1
st_atim.tv_nsec 1
package-string: glusterfs 7.1
/lib64/libglusterfs.so.0(+0x277ff)[0x2b4618afe7ff]
/lib64/libglusterfs.so.0(gf_print_trace+0x334)[0x2b4618b09234]
/lib64/libc.so.6(+0x36280)[0x2b461a424280]
/lib64/libglusterfs.so.0(+0xaeeb5)[0x2b4618b85eb5]
/lib64/libpthread.so.0(+0x7dd5)[0x2b4619d78dd5]
/lib64/libc.so.6(clone+0x6d)[0x2b461a4ebead]
---------

System version information

[root@szdpl1543 ~]# uname -a
Linux szdpl1543 3.10.0-957.el7.x86_64 #1 SMP Thu Nov 8 23:39:32 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
[root@szdpl1543 ~]# gluster --version
glusterfs 7.2
Repository revision: git://git.gluster.org/glusterfs.git
Copyright (c) 2006-2016 Red Hat, Inc. <https://www.gluster.org/>
GlusterFS comes with ABSOLUTELY NO WARRANTY.
It is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3
or later), or the GNU General Public License, version 2 (GPLv2),
in all cases as published by the Free Software Foundation.
[root@szdpl1543 ~]# gluster volume info hot2_vol

Volume Name: hot2_vol
Type: Distribute
Volume ID: a04761f1-3380-4795-9054-cf152e55ea61
Status: Started
Snapshot Count: 0
Number of Bricks: 3
Transport-type: tcp
Bricks:
Brick1: 172.25.78.240:/glusterfs/hotvol2/brick1
Brick2: 172.25.78.241:/glusterfs/hotvol2/brick1
Brick3: 172.25.78.242:/glusterfs/hotvol2/brick1
Options Reconfigured:
storage.health-check-interval: 0
performance.write-behind: on
performance.flush-behind: on
performance.aggregate-size: 1mb
performance.read-ahead-page-count: 16
performance.client-io-threads: off
performance.io-cache: on
nfs.disable: on
storage.fips-mode-rchecksum: on
transport.address-family: inet
server.outstanding-rpc-limit: 512
network.tcp-window-size: 1048576
cluster.min-free-disk: 20
server.event-threads: 64
client.event-threads: 32
performance.io-thread-count: 32
performance.readdir-ahead: on
cluster.readdir-optimize: on
performance.parallel-readdir: on
performance.quick-read: on
cluster.lookup-unhashed: off
performance.rda-cache-limit: 16GB
cluster.lookup-optimize: on
performance.stat-prefetch: on
features.cache-invalidation: on
features.cache-invalidation-timeout: 600
performance.cache-invalidation: on
performance.md-cache-timeout: 600
network.inode-lru-limit: 200000

Mount options

mount -t glusterfs -o acl,global-threading 172.25.78.240:hot2_vol /data/glusterfs_train_hot2

Solution

//Whenever an open call is received, open-behind immediately reports success to the application, which improves the application's ability to start reading data from the file
Option: performance.open-behind
Default Value: on
Description: enable/disable open-behind translator in the volume.
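
If disabling open-behind is the workaround chosen here, it can be applied to the affected volume (hot2_vol from the logs above) with the standard volume-set command:

gluster volume set hot2_vol performance.open-behind off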

Related issue

Automatic File Replication

Afr xlator in glusterfs is responsible for replicating the data across the bricks.

Responsibilities of AFR

Its responsibilities include the following:

  1. Maintain replication consistency (i.e. data on both the bricks should be the same, even when operations happen on the same file/directory in parallel from multiple applications/mount points, as long as all the bricks in the replica set are up)

  2. Provide a way of recovering data in case of failures as long as there is
    at least one brick which has the correct data.

  3. Serve fresh data for read/stat/readdir etc

Transaction framework

For responsibilities 1 and 2 above, afr uses a transaction framework which consists of the following 5
phases that happen on all the bricks in the replica set (the bricks which are in replication):

1. Lock Phase

2. Pre-op Phase

3. Op Phase

4. Post-op Phase

5. Unlock Phase

Op Phase is the actual operation sent by application (write/create/unlink etc). For every operation which afr receives that modifies data it sends that same operation in parallel to all the bricks in its replica set. This is how it achieves replication.

Lock, Unlock Phases take necessary locks so that Op phase can provide replication consistency in normal work flow.

For example:

If an application performs touch a and the other one does mkdir a, afr makes sure that either file with name a or directory with name a is created on both the bricks.

Pre-op, Post-op Phases provide changelogging which enables afr to figure out which copy is fresh.
Once afr knows how to figure out fresh copy in the replica set it can recover data from fresh copy to stale copy. Also it can serve fresh data for read/stat/readdir etc.

Internal Operations

Brief introduction to internal operations in Glusterfs which make Locking, Unlocking, Pre/Post ops possible:

Internal Locking Operations

Glusterfs has a locks translator which provides two internal locking operations, inodelk and entrylk, which are used by afr to synchronize operations on files or directories that conflict with each other.

Inodelk gives the facility for translators in Glusterfs to obtain range (denoted by tuple with offset, length) locks in a given domain for an inode.
Full file lock is denoted by the tuple (offset: 0, length: 0) i.e. length 0 is considered as infinity.

Entrylk enables translators of Glusterfs to obtain locks on name in a given domain for an inode, typically a directory.

The locks translator provides both blocking and non-blocking variants of these operations.

Xattrop

For pre/post ops posix translator provides an operation called xattrop.
xattrop is a way of incrementing/decrementing a number present in the extended attribute of the inode atomically.

Transaction Types

There are 3 types of transactions in AFR.

  1. Data transactions

    • Operations that add/modify/truncate the file contents.
    • Write/Truncate/Ftruncate etc
  2. Metadata transactions

    • Operations that modify the data kept in inode.
    • Chmod/Chown etc

  3. Entry transactions

    • Operations that add/remove/rename entries in a directory
    • Touch/Mkdir/Mknod etc

Data transactions:

write (offset, size) - writes data of the given size starting at the given offset

ftruncate/truncate (offset) - truncates data from offset till the end of file.

Afr internal locking needs to make sure that two conflicting data operations happen in order, one after the other, so that they do not result in replication inconsistency. Afr data operations take inodelks in the same domain (let's call it the data domain).

Write operation with offset O and size S takes an inode lock in data domain with range (O, S).

Ftruncate/Truncate operations with offset O take inode locks in data domain with range (O, 0). Please note that size 0 means size infinity.

These ranges make sure that overlapping write/truncate/ftruncate operations are done one after the other.
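
To make the ranges concrete, here is a minimal sketch in Go (hypothetical helper names, not GlusterFS code) of the inodelk range each kind of data operation requests in the data domain:

package main

import "fmt"

// lockRange describes an inodelk range in the data domain.
// A length of 0 means "until end of file", i.e. infinity.
type lockRange struct {
    offset uint64
    length uint64
}

// writeRange: a write at offset O of size S locks (O, S).
func writeRange(offset, size uint64) lockRange { return lockRange{offset, size} }

// truncateRange: truncate/ftruncate at offset O locks (O, 0), i.e. from O to EOF.
func truncateRange(offset uint64) lockRange { return lockRange{offset, 0} }

func main() {
    fmt.Println(writeRange(4096, 128))  // {4096 128}
    fmt.Println(truncateRange(1 << 20)) // {1048576 0}
}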

Now that we know the ranges the operations take locks on, we will see how locking happens in afr.

Lock:

Afr initially attempts non-blocking locks on all the bricks of the replica set in parallel. If all the locks are successful then it goes on to perform pre-op. But in case non-blocking locks fail because there is at least one conflicting operation which already has a granted lock then it unlocks the non-blocking locks it already acquired in this previous step and proceeds to perform blocking locks one after the other on each of the subvolumes in the order of subvolumes specified in the volfile.

Chances of conflicting operations are very low, and the time elapsed in the non-blocking locks phase is Max(latencies of the bricks for responding to inodelk), whereas the time elapsed in the blocking locks phase is Sum(latencies of the bricks for responding to inodelk). That is why afr always tries non-blocking locks first and only then moves to blocking locks.

Pre-op:

Each file/dir in a brick maintains the changelog (roughly a pending operation count) of itself and of the copies of the
file present on all the other bricks in its replica set, as seen by that brick.

Let's consider an example replica volume with 2 bricks, brick-a and brick-b.
All files in brick-a will have 2 entries:
one for itself and the other for the file present in its replica set, i.e. brick-b.
One can inspect the changelogs using the getfattr command.

# getfattr -d -e hex -m. brick-a/file.txt
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for itself (brick-a)
trusted.afr.vol-client-1=0x000000000000000000000000 -->changelog for brick-b as seen by brick-a

Likewise, all files in brick-b will have:

# getfattr -d -e hex -m. brick-b/file.txt
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for brick-a as seen by brick-b
trusted.afr.vol-client-1=0x000000000000000000000000 -->changelog for itself (brick-b)
Interpreting Changelog Value:

Each extended attribute has a value which is 24 hexadecimal digits, i.e. 12 bytes.
The first 8 digits (4 bytes) represent the changelog of data. The second 8 digits represent the changelog
of metadata. The last 8 digits represent the changelog of directory entries.

Pictorially representing the same, we have:

0x 00000000 00000000 00000000
      |        |        |
      |        |        \_ changelog of directory entries
      |        \_ changelog of metadata
      \ _ changelog of data
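
The value can be decoded mechanically. Below is a small illustrative Go helper (not part of GlusterFS) that splits a trusted.afr.* value, as printed by getfattr, into its data/metadata/entry pending counters:

package main

import (
    "fmt"
    "strconv"
    "strings"
)

// parseChangelog splits a trusted.afr.* value such as
// "0x000000010000000000000000" into its three 32-bit counters:
// pending data, metadata and entry operations.
func parseChangelog(v string) (data, meta, entry uint64, err error) {
    v = strings.TrimPrefix(v, "0x")
    if len(v) != 24 {
        return 0, 0, 0, fmt.Errorf("expected 24 hex digits, got %d", len(v))
    }
    if data, err = strconv.ParseUint(v[0:8], 16, 32); err != nil {
        return
    }
    if meta, err = strconv.ParseUint(v[8:16], 16, 32); err != nil {
        return
    }
    entry, err = strconv.ParseUint(v[16:24], 16, 32)
    return
}

func main() {
    d, m, e, _ := parseChangelog("0x000000010000000000000000")
    fmt.Println(d, m, e) // 1 0 0 -> one pending data operation
}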

Before write operation is performed on the brick, afr marks the file saying there is a pending operation.

As part of this pre-op afr sends xattrop operation with increment 1 for data operation to make the extended attributes the following:
# getfattr -d -e hex -m. brick-a/file.txt
trusted.afr.vol-client-0=0x000000010000000000000000 -->changelog for itself (brick-a)
trusted.afr.vol-client-1=0x000000010000000000000000 -->changelog for brick-b as seen by brick-a

Likewise, all files in brick-b will have:

# getfattr -d -e hex -m. brick-b/file.txt
trusted.afr.vol-client-0=0x000000010000000000000000 -->changelog for brick-a as seen by brick-b
trusted.afr.vol-client-1=0x000000010000000000000000 -->changelog for itself (brick-b)

As the operation is in progress on files on both the bricks all the extended attributes show the same value.

Op:

Now it sends the actual write operation to both the bricks. Afr remembers whether the operation is successful or not on all the subvolumes.

Post-Op:

If the operation succeeds on all the bricks then there are no pending operations on any of the bricks, so as part of the POST-OP afr sends an xattrop operation with increment -1, i.e. decrement by 1, for the data operation to bring the extended attributes back to all zeros again.

In case there is a failure on brick-b, then there is still a pending operation on brick-b whereas there are no pending operations on brick-a. So the xattrop operation for these two extended attributes now differs. For the extended attribute corresponding to brick-a, i.e. trusted.afr.vol-client-0, a decrement by 1 is sent, whereas for the extended attribute corresponding to brick-b an increment by 0 is sent to retain the pending operation count.

# getfattr -d -e hex -m. brick-a/file.txt
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for itself (brick-a)
trusted.afr.vol-client-1=0x000000010000000000000000 -->changelog for brick-b as seen by brick-a

# getfattr -d -e hex -m. brick-b/file.txt
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for brick-a as seen by brick-b
trusted.afr.vol-client-1=0x000000010000000000000000 -->changelog for itself (brick-b)

Unlock:

Once the transaction is completed unlock is sent on all the bricks where lock is acquired.

Metadata transactions:

setattr, setxattr, removexattr
All metadata operations take the same inode lock with the same range in the metadata domain.

Lock:

Metadata locking also starts with non-blocking locks and then moves on to blocking locks on failures caused by conflicting operations.

Pre-op:

Before metadata operation is performed on the brick, afr marks the file saying there is a pending operation.
As part of this pre-op afr sends xattrop operation with increment 1 for metadata operation to make the extended attributes the following:
# getfattr -d -e hex -m. brick-a/file.txt
trusted.afr.vol-client-0=0x000000000000000100000000 -->changelog for itself (brick-a)
trusted.afr.vol-client-1=0x000000000000000100000000 -->changelog for brick-b as seen by brick-a

Likewise, all files in brick-b will have:
# getfattr -d -e hex -m. brick-b/file.txt
trusted.afr.vol-client-0=0x000000000000000100000000 -->changelog for brick-a as seen by brick-b
trusted.afr.vol-client-1=0x000000000000000100000000 -->changelog for itself (brick-b)

As the operation is in progress on files on both the bricks all the extended attributes show the same value.

Op:

Now it sends the actual metadata operation to both the bricks. Afr remembers whether the operation is successful or not on all the subvolumes.

Post-Op:
If the operation succeeds on all the bricks then there are no pending operations on any of the bricks, so as part of the POST-OP afr sends an xattrop operation with increment -1, i.e. decrement by 1, for the metadata operation to bring the extended attributes back to all zeros again.

In case there is a failure on brick-b, then there is still a pending operation on brick-b whereas there are no pending operations on brick-a. So the xattrop operation for these two extended attributes now differs. For the extended attribute corresponding to brick-a, i.e. trusted.afr.vol-client-0, a decrement by 1 is sent, whereas for the extended attribute corresponding to brick-b an increment by 0 is sent to retain the pending operation count.

# getfattr -d -e hex -m. brick-a/file.txt
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for itself (brick-a)
trusted.afr.vol-client-1=0x000000000000000100000000 -->changelog for brick-b as seen by brick-a

# getfattr -d -e hex -m. brick-b/file.txt
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for brick-a as seen by brick-b
trusted.afr.vol-client-1=0x000000000000000100000000 -->changelog for itself (brick-b)

Unlock:

Once the transaction is completed unlock is sent on all the bricks where lock is acquired.

Entry transactions:

create, mknod, mkdir, link, symlink, rename, unlink, rmdir
Pre-op/Post-op (done using xattrop) always happens on the parent directory.

Entry Locks taken by these entry operations:

Create (file dir/a): Lock on name a in inode of dir

mknod (file dir/a): Lock on name a in inode of dir

mkdir (dir dir/a): Lock on name a in inode of dir

link (file oldfile, file dir/newfile): Lock on name newfile in inode of dir

Symlink (file oldfile, file dir/symlinkfile): Lock on name symlinkfile in inode of dir

rename of (file dir1/file1, file dir2/file2): Lock on name file1 in inode of dir1, Lock on name file2 in inode of dir2

rename of (dir dir1/dir2, dir dir3/dir4): Lock on name dir2 in inode of dir1, Lock on name dir4 in inode of dir3, Lock on NULL in inode of dir4

unlink (file dir/a): Lock on name a in inode of dir

rmdir (dir dir/a): Lock on name a in inode of dir, Lock on NULL in inode of a

Lock:

Entry locking also starts with non-blocking locks and then moves on to blocking locks on failures caused by conflicting operations.

Pre-op:

Before entry operation is performed on the brick, afr marks the directory saying there is a pending operation.

As part of this pre-op afr sends xattrop operation with increment 1 for entry operation to make the extended attributes the following:

# getfattr -d -e hex -m. brick-a/
trusted.afr.vol-client-0=0x000000000000000000000001 -->changelog for itself (brick-a)
trusted.afr.vol-client-1=0x000000000000000000000001 -->changelog for brick-b as seen by brick-a

Likewise, all files in brick-b will have:
# getfattr -d -e hex -m. brick-b/
trusted.afr.vol-client-0=0x000000000000000000000001 -->changelog for brick-a as seen by brick-b
trusted.afr.vol-client-1=0x000000000000000000000001 -->changelog for itself (brick-b)

As the operation is in progress on files on both the bricks all the extended attributes show the same value.

Op:

Now it sends the actual entry operation to both the bricks. Afr remembers whether the operation is successful or not on all the subvolumes.

Post-Op:

If the operation succeeds on all the bricks then there are no pending operations on any of the bricks, so as part of the POST-OP afr sends an xattrop operation with increment -1, i.e. decrement by 1, for the entry operation to bring the extended attributes back to all zeros again.

In case there is a failure on brick-b, then there is still a pending operation on brick-b whereas there are no pending operations on brick-a. So the xattrop operation for these two extended attributes now differs. For the extended attribute corresponding to brick-a, i.e. trusted.afr.vol-client-0, a decrement by 1 is sent, whereas for the extended attribute corresponding to brick-b an increment by 0 is sent to retain the pending operation count.

# getfattr -d -e hex -m. brick-a/file.txt
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for itself (brick-a)
trusted.afr.vol-client-1=0x000000000000000000000001 -->changelog for brick-b as seen by brick-a

# getfattr -d -e hex -m. brick-b/file.txt
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for brick-a as seen by brick-b
trusted.afr.vol-client-1=0x000000000000000000000001 -->changelog for itself (brick-b)

Unlock:

Once the transaction is completed unlock is sent on all the bricks where lock is acquired.

The parts above cover how replication consistency is achieved in afr.

Now let us look at how afr figures out how to recover from failures given the changelog extended attributes.

Recovering from failures (Self-heal)

For recovering from failures afr tries to determine which copy is the fresh copy based on the extended attributes.

There are 3 possibilities:

  1. All the extended attributes are zero on all the bricks. This means there are no pending operations on any of the bricks so there is nothing to recover.

  2. According to the extended attributes there is a brick (brick-a) which noticed that there are operations pending on the other brick (brick-b).

    • There are 4 possibilities for brick-b

      • It did not even participate in transaction (all extended attributes on brick-b are zeros). Choose brick-a as source and perform recovery to brick-b.

      • It participated in the transaction but died even before post-op. (All extended attributes on brick-b have a pending-count). Choose brick-a as source and perform recovery to brick-b.

      • It participated in the transaction and after the post-op extended attributes on brick-b show that there are pending operations on itself. Choose brick-a as source and perform recovery to brick-b.

      • It participated in the transaction and after the post-op extended attributes on brick-b show that there are pending operations on brick-a. This situation is called Split-brain and there is no way to recover. This situation can happen in cases of network partition.

  3. The only remaining possibility is that both brick-a and brick-b have pending operations, i.e. the changelog extended attributes are non-zero on all the bricks. This can happen when operations started on the file but either the whole replica set went down or the mount process itself died before the post-op was performed. In this case the data on the bricks may differ. Afr then chooses the file with the bigger size as the source; if both files have the same size it chooses the subvolume that has witnessed the larger number of pending operations on the other brick as the source; if both have the same number of pending operations it chooses the file with the newest ctime as the source; and if even that is the same it simply picks one of the two bricks as the source and syncs its data onto the other so that the files become replicas of each other (sketched below).
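
A minimal sketch of that selection order in Go, using invented names and a deliberately simplified view of each brick's copy (this is not afr's actual heuristics code):

package main

import (
    "fmt"
    "time"
)

// copyState is a simplified view of one brick's copy of a file.
type copyState struct {
    brick         string
    size          int64
    pendingOnPeer uint32 // pending operations this brick has witnessed on the other brick
    ctime         time.Time
}

// pickSource follows the order described above: bigger size, then more
// witnessed pending operations, then newer ctime, then an arbitrary pick.
func pickSource(a, b copyState) copyState {
    switch {
    case a.size != b.size:
        if a.size > b.size {
            return a
        }
        return b
    case a.pendingOnPeer != b.pendingOnPeer:
        if a.pendingOnPeer > b.pendingOnPeer {
            return a
        }
        return b
    case !a.ctime.Equal(b.ctime):
        if a.ctime.After(b.ctime) {
            return a
        }
        return b
    default:
        return a // both copies look identical; pick one arbitrarily
    }
}

func main() {
    a := copyState{brick: "brick-a", size: 4096, pendingOnPeer: 1, ctime: time.Now()}
    b := copyState{brick: "brick-b", size: 4096, pendingOnPeer: 3, ctime: time.Now()}
    fmt.Println("source:", pickSource(a, b).brick) // brick-b (more witnessed pending operations)
}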

Self-healing:

Afr does 3 types of self-heals for data recovery.

  1. Data self-heal

  2. Metadata self-heal

  3. Entry self-heal

As we have seen earlier, afr depends on the changelog extended attributes to figure out which copy is the source and which copy is the sink. The general algorithm for performing this recovery (self-heal) is the same for all of these different self-heals.

  1. Take appropriate full locks on the file/directory to make sure no other transaction is in progress while inspecting changelog extended attributes.
    In this step, for

    • Data self-heal afr takes inode lock with offset: 0 and size: 0(infinity) in data domain.
    • Entry self-heal takes entry lock on directory with NULL name i.e. full directory lock.
    • Metadata self-heal it takes pre-defined range in metadata domain on which all the metadata operations on that inode take locks on. To prevent duplicate data self-heal an inode lock is taken in self-heal domain as well.
  2. Perform Sync from fresh copy to stale copy.
    In this step,

    • Metadata self-heal gets the inode attributes, extended attributes from source copy and sets them on the stale copy.

    • Entry self-heal reads the entries on the stale directories and checks whether they are present on the source directory; if they are not present it deletes them. Then it reads the entries on the fresh directory and creates the missing entries on the stale directories.

    • Data self-heal does things a bit differently to make sure no other writes on the file are blocked for the duration of the self-heal, because file sizes could be as big as 100G (VM files) and we don't want to block all transactions until the self-heal is over. The locks translator allows two overlapping locks to be granted if they are from the same lock owner. Using this, data self-heal takes a small 128k range lock, releases the previously acquired lock, heals just that 128k chunk, then takes the lock for the next 128k chunk, releases the previous one, and moves on. It always makes sure that at least one self-heal lock is present on the file throughout the duration of the self-heal so that two self-heals don't happen in parallel.

    • Data self-heal has two algorithms: one where a chunk is copied only when there is a data mismatch for that chunk, called 'diff' self-heal, and another which is a blind copy of each chunk, called 'full' self-heal.

  3. Change extended attributes to mark new sources after the sync.

  4. Unlock the locks acquired to perform self-heal.

Transaction Optimizations:

As we saw earlier, the afr transaction for every operation that modifies data happens in 5 phases, i.e. afr sends 5 operations over the network for every application operation. In the following sections we will see optimizations already implemented in afr which reduce the number of network operations to just 1 per transaction in the best case.

Changelog-piggybacking

This optimization comes into the picture when, on the same file descriptor, write2's pre-op starts before write1's post-op is complete and the operations are succeeding. When writes come in that manner, write2 can piggyback on write1's pre-op, and write1 is somehow told that write2 will do the post-op that write1 was supposed to do. So write1's post-op does not happen over the network and write2's pre-op does not happen over the network. This optimization does not hold if there are any failures in write1's phases.

Delayed Post-op

This optimization simply delays the post-op of a write transaction (write1) by a pre-configured amount of time to increase the probability of the next write piggybacking on the pre-op done by write1.

With the combination of these two optimizations, for operations like a full file copy, which are write intensive, what essentially happens is that a pre-op happens for the first write and a post-op happens for the last write on the file. So for all the write transactions between the first write and the last write, afr reduces the network operations from 5 to 3.

Eager-locking:

This optimization comes into the picture when only one file descriptor is open on the file and is performing writes, just like in the previous optimization. What this optimization does is take a full-file lock irrespective of the offset and size of the write, so that the lock acquired by write1 can be piggybacked by write2, and write2 takes the responsibility of unlocking it. Both write1 and write2 will have the same lock owner, and afr takes the responsibility of serializing overlapping writes so that replication consistency is maintained.

With the combination of these optimizations, for operations like a full file copy, which are write intensive, what essentially happens is that a pre-op and a full-file lock happen for the first write, and a post-op and unlock happen for the last write on the file. So for all the write transactions between the first write and the last write, afr reduces the network operations from 5 to 1.

Quorum in afr:

To avoid split-brains, afr employs the following quorum policies.

  • In a replica set with an odd number of bricks, the replica set is said to be in quorum if more than half of the bricks are up.
  • In a replica set with an even number of bricks, if more than half of the bricks are up then it is in quorum; if the number of bricks that are up equals the number that are down, it is in quorum only if the first brick of the set is among the bricks that are up (a small sketch of this check follows below).

When quorum is not met in the replica set then modify operations on the mount are not allowed by afr.
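
A minimal sketch of these two rules in Go, assuming a hypothetical helper where up[i] says whether brick i of the replica set is reachable and index 0 is the first brick of the set:

package main

import "fmt"

// hasQuorum implements the quorum rules described above for one replica set.
func hasQuorum(up []bool) bool {
    n := len(up)
    alive := 0
    for _, u := range up {
        if u {
            alive++
        }
    }
    if alive*2 > n { // more than half of the bricks are up
        return true
    }
    // Even-sized replica set split exactly in half: quorum holds only if the
    // first brick is among the bricks that are up.
    if n%2 == 0 && alive*2 == n {
        return up[0]
    }
    return false
}

func main() {
    fmt.Println(hasQuorum([]bool{true, true, false})) // true: 2 of 3 bricks up
    fmt.Println(hasQuorum([]bool{true, false}))       // true: half up, first brick up
    fmt.Println(hasQuorum([]bool{false, true}))       // false: half up, first brick down
}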

Self-heal daemon and Index translator usage by afr:

Index xlator

On each brick the index xlator is loaded. This xlator keeps track of what is happening in afr's pre-op and post-op. If there is an ongoing I/O or a pending self-heal, the changelog xattrs will have non-zero values. Whenever an xattrop/fxattrop fop (pre-op and post-op are done using these fops) comes to the index xlator, a link (named with the gfid of the file on which the fop is performed) is added in the /.glusterfs/indices/xattrop directory. If the value returned by the fop is zero the link is removed from the index; otherwise it is kept until zero is returned by a subsequent xattrop/fxattrop fop.
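
The bookkeeping described above can be pictured with a small illustrative Go sketch; the path layout follows the description, but the helper names and the sample gfid are invented:

package main

import (
    "fmt"
    "path/filepath"
)

// indexPath is where a link named after the file's gfid would live while
// that file has a non-zero (pending) changelog.
func indexPath(brickRoot, gfid string) string {
    return filepath.Join(brickRoot, ".glusterfs", "indices", "xattrop", gfid)
}

// onXattrop mimics the decision described above: keep the gfid link while the
// changelog value returned by the fop is non-zero, drop it once it is zero.
func onXattrop(brickRoot, gfid string, changelogIsZero bool) {
    link := indexPath(brickRoot, gfid)
    if changelogIsZero {
        fmt.Println("remove", link)
    } else {
        fmt.Println("keep  ", link)
    }
}

func main() {
    onXattrop("/data/brick", "5b5c1c34-7a3e-4c9f-9d3c-0123456789ab", false)
    onXattrop("/data/brick", "5b5c1c34-7a3e-4c9f-9d3c-0123456789ab", true)
}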

Self-heal-daemon:

The self-heal-daemon process keeps running on each machine of the trusted storage pool. This process contains the afr xlators of all the volumes which are started. Its job is to crawl the indices on bricks that are local to that machine; if any of the files represented by the gfid in a link name need healing, it heals them automatically. This operation is performed every 10 minutes for each replica set. It is also performed whenever a brick comes online.

How GlusterFS Distribution Works

The defining feature of any scale-out system is its ability to distribute work
or data among many servers. Accordingly, people in the distributed-system
community have developed many powerful techniques to perform such distribution,
but those techniques often remain little known or understood even among other
members of the file system and database communities that benefit. This
confusion is represented even in the name of the GlusterFS component that
performs distribution - DHT, which stands for Distributed Hash Table but is not
actually a DHT as that term is most commonly used or defined. The way
GlusterFS’s DHT works is based on a few basic principles:

  • All operations are driven by clients, which are all equal. There are no
    special nodes with special knowledge of where files are or should be.

  • Directories exist on all subvolumes (bricks or lower-level aggregations of
    bricks); files exist on only one.

  • Files are assigned to subvolumes based on consistent hashing, and even
    more specifically a form of consistent hashing exemplified by Amazon’s
    Dynamo.

The result of all this is that users are presented with a set of files that is
the union of the files present on all subvolumes. The following sections
describe how this “uniting” process actually works.

Layouts

The conceptual basis of Dynamo-style consistent hashing is of numbers around a
circle, like a clock. First, the circle is divided into segments and those
segments are assigned to bricks. (For the sake of simplicity we’ll use
“bricks” hereafter even though they might actually be replicated/striped
subvolumes.) Several factors guide this assignment.

  • Assignments are done separately for each directory.

  • Historically, segments have all been the same size. However, this can lead
    to smaller bricks becoming full while plenty of space remains on larger
    ones. If the cluster.weighted-rebalance option is set, segment sizes
    will be proportional to brick sizes.

  • Assignments need not include all bricks in the volume. If the
    cluster.subvols-per-directory option is set, only that many bricks will
    receive assignments for that directory.

However these assignments are done, they collectively become what we call a
layout for a directory. This layout is then stored using extended
attributes, with each brick’s copy of that extended attribute on that directory
consisting of four 32-bit fields.

  • A version, which might be DHT_HASH_TYPE_DM to represent an assignment as
    described above, or DHT_HASH_TYPE_DM_USER to represent an assignment made
    manually by the user (or external script).

  • A “commit hash” which will be described later.

  • The first number in the assigned range (segment).

  • The last number in the assigned range.

For example, the extended attributes representing a weighted assignment between
three bricks, one twice as big as the others, might look like this.

  • Brick A (the large one): DHT_HASH_TYPE_DM 1234 0 0x7fffffff

  • Brick B: DHT_HASH_TYPE_DM 1234 0x80000000 0xbfffffff

  • Brick C: DHT_HASH_TYPE_DM 1234 0xc0000000 0xffffffff

Placing Files

To place a file in a directory, we first need a layout for that directory - as
described above. Next, we calculate a hash for the file. To minimize
collisions either between files in the same directory with different names or
between files in different directories with the same name, this hash is
generated using both the (containing) directory’s unique GFID and the file’s
name. This hash is then matched to one of the layout assignments, to yield
what we call a hashed location. For example, consider the layout shown
above. The hash 0xabad1dea is between 0x80000000 and 0xbfffffff, so the
corresponding file’s hashed location would be on Brick B. A second file with a
hash of 0xfaceb00c would be assigned to Brick C by the same reasoning.
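
Putting the layout and the hash together, here is a small hypothetical Go sketch (not DHT's actual code) of mapping a file's hash to a brick:

package main

import "fmt"

// assignment mirrors the per-brick extended attribute described above:
// an assigned [start, end] range on the 32-bit hash ring.
type assignment struct {
    brick string
    start uint32
    end   uint32
}

// hashedLocation returns the brick whose range contains the file's hash.
func hashedLocation(layout []assignment, hash uint32) (string, bool) {
    for _, a := range layout {
        if hash >= a.start && hash <= a.end {
            return a.brick, true
        }
    }
    return "", false // a hole in the layout
}

func main() {
    // The weighted three-brick example from the text.
    layout := []assignment{
        {"Brick A", 0x00000000, 0x7fffffff},
        {"Brick B", 0x80000000, 0xbfffffff},
        {"Brick C", 0xc0000000, 0xffffffff},
    }
    fmt.Println(hashedLocation(layout, 0xabad1dea)) // Brick B true
    fmt.Println(hashedLocation(layout, 0xfaceb00c)) // Brick C true
}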

Looking Up Files

Because layout assignments might change, especially as bricks are added or
removed, finding a file involves more than calculating its hashed location and
looking there. That is in fact the first step, and works most of the time -
i.e. the file is found where we expected it to be - but there are a few more
steps when that’s not the case. Historically, the next step has been to look
for the file everywhere - i.e. to broadcast our lookup request to all
subvolumes. If the file isn’t found that way, it doesn’t exist. At this
point, an open that requires the file’s presence will fail, or a create/mkdir
that requires its absence will be allowed to continue.

Regardless of whether a file is found at its hashed location or elsewhere, we
now know its cached location. As the name implies, this is stored within DHT
to satisfy future lookups. If it’s not the same as the hashed location, we
also take an extra step. This step is the creation of a linkfile, which is a
special stub left at the hashed location pointing to the cached
location. Therefore, if a client naively looks for a file at its hashed
location and finds a linkfile instead, it can use that linkfile to look up the
file where it really is instead of needing to inquire everywhere.

Rebalancing

As bricks are added or removed, or files are renamed, many files can end up
somewhere other than at their hashed locations. When this happens, the volumes
need to be rebalanced. This process consists of two parts.

  1. Calculate new layouts, according to the current set of bricks (and possibly
    their characteristics). We call this the “fix-layout” phase.

  2. Migrate any “misplaced” files to their correct (hashed) locations, and
    clean up any linkfiles which are no longer necessary. We call this the
    “migrate-data” phase.

Usually, these two phases are done together. (In fact, the code for them is
somewhat intermingled.) However, the migrate-data phase can involve a lot of
I/O and be very disruptive, so users can do just the fix-layout phase and defer
migrate-data until a more convenient time. This allows new files to be placed
on new bricks, even though old files might still be in the “wrong” place.

When calculating a new layout to replace an old one, DHT specifically tries to
maximize overlap of the assigned ranges, thus minimizing data movement. This
difference can be very large. For example, consider the case where our example
layout from earlier is updated to add a new double-sized brick. Here's a very
inefficient way to do that.

  • Brick A (the large one): 0x00000000 to 0x55555555

  • Brick B: 0x55555556 to 0x7fffffff

  • Brick C: 0x80000000 to 0xaaaaaaaa

  • Brick D (the new one): 0xaaaaaaab to 0xffffffff

This would cause files in the following ranges to be migrated:

  • 0x55555556 to 0x7fffffff (from A to B)

  • 0x80000000 to 0xaaaaaaaa (from B to C)

  • 0xaaaaaaab to 0xbfffffff (from B to D)

  • 0xc0000000 to 0xffffffff (from C to D)

As an historical note, this is exactly what we used to do, and in this case it
would have meant moving 8/12 (two thirds) of all files in the volume, as the four ranges above add up to. Now let's consider a
new layout that’s optimized to maximize overlap with the old one.

  • Brick A: 0x00000000 to 0x55555555

  • Brick D: 0x55555556 to 0xaaaaaaaa <- optimized insertion point

  • Brick B: 0xaaaaaaab to 0xd5555554

  • Brick C: 0xd5555555 to 0xffffffff

In this case we only need to move 5/12 of all files. In a volume with millions
or even billions of files, reducing data movement by 3/12 (a quarter) of all files is a
pretty big improvement. In the future, DHT might use “virtual node IDs” or
multiple hash rings to make rebalancing even more efficient.

Rename Optimizations

With the file-lookup mechanisms we already have in place, it’s not necessary to
move a file from one brick to another when it’s renamed - even across
directories. It will still be found, albeit a little less efficiently. The
first client to look for it after the rename will add a linkfile, which every
other client will follow from then on. Also, every client that has found the
file once will continue to find it based on its cached location, without any
network traffic at all. Because the extra lookup cost is small, and the
movement cost might be very large, DHT renames the file “in place” on its
current brick instead (taking advantage of the fact that directories exist
everywhere).

This optimization is further extended to handle cases where renames are very
common. For example, rsync and similar tools often use a “write new then
rename” idiom in which a file “xxx” is actually written as “.xxx.1234” and then
moved into place only after its contents have been fully written. To make this
process more efficient, DHT uses a regular expression to separate the permanent
part of a file’s name (in this case “xxx”) from what is likely to be a
temporary part (the leading “.” and trailing “.1234”). That way, after the
file is renamed it will be in its correct hashed location - which it wouldn’t
be otherwise if “xxx” and “.xxx.1234” hash differently - and no linkfiles or
broadcast lookups will be necessary.

In fact, there are two regular expressions available for this purpose -
cluster.rsync-hash-regex and cluster.extra-hash-regex. As its name
implies, rsync-hash-regex defaults to the pattern that rsync uses, while
extra-hash-regex can be set by the user to support a second tool using the
same temporary-file idiom.
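
To illustrate the idea only (the exact default pattern is an assumption here and may differ between versions), a short Go snippet that recovers the permanent part of an rsync-style temporary name so that ".xxx.1234" hashes like "xxx":

package main

import (
    "fmt"
    "regexp"
)

// tempNameRE is an assumed rsync-style pattern: a leading dot, the permanent
// name captured in the middle, and a trailing ".<random-suffix>".
var tempNameRE = regexp.MustCompile(`^\.(.+)\.[^.]+$`)

// permanentName returns the name that would be hashed for placement.
func permanentName(name string) string {
    if m := tempNameRE.FindStringSubmatch(name); m != nil {
        return m[1]
    }
    return name
}

func main() {
    fmt.Println(permanentName(".xxx.1234")) // xxx
    fmt.Println(permanentName("xxx"))       // xxx
}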

Commit Hashes

A very recent addition to DHT’s algorithmic arsenal is intended to reduce the
number of “broadcast” lookups that it issues. If a volume is completely in
balance, then no file could exist anywhere but at its hashed location.
Therefore, if we’ve already looked there and not found it, then looking
elsewhere would be pointless (and wasteful). The commit hash mechanism is
used to detect this case. A commit hash is assigned to a volume, and
separately to each directory, and then updated according to the following
rules.

  • The volume commit hash is changed whenever actions are taken that might
    cause layout assignments across all directories to become invalid - i.e.
    bricks being added, removed, or replaced.

  • The directory commit hash is changed whenever actions are taken that might
    cause files to be “misplaced” - e.g. when they’re renamed.

  • The directory commit hash is set to the volume commit hash when the
    directory is created, and whenever the directory is fully rebalanced so that
    all files are at their hashed locations.

In other words, whenever either the volume or directory commit hash is changed,
a mismatch is created. In that case we revert to the “pessimistic”
broadcast-lookup method described earlier. However, if the two hashes match
then we can skip the broadcast lookup and return a result immediately.
This has been observed to cause a 3x performance improvement in workloads that
involve creating many small files across many bricks.

core: implement a global thread pool

This patch implements a thread pool that is wait-free for adding jobs to
the queue and uses a very small locked region to get jobs. This makes it
possible to decrease contention drastically. It's based on wfcqueue
structure provided by urcu library.

It automatically enables more threads when load demands it, and stops
them when not needed. There's a maximum number of threads that can be
used. This value can be configured.

Depending on the workload, the maximum number of threads plays an
important role. So it needs to be configured for optimal performance.
Currently the thread pool doesn't self adjust the maximum for the
workload, so this configuration needs to be changed manually.

For this reason, the global thread pool has been made optional, so that
volumes can still use the thread pool provided by io-threads.

To enable it for bricks, the following option needs to be set:

config.global-threading = on

This option has no effect if bricks are already running. A restart is
required to activate it. It's recommended to also enable the following
option when running bricks with the global thread pool:

performance.iot-pass-through = on

To enable it for a FUSE mount point, the option '--global-threading'
must be added to the mount command. To change it, an umount and remount
is needed. It's recommended to disable the following option when using
global threading on a mount point:

performance.client-io-threads = off

Currently it can only be enabled for bricks and FUSE mounts.

The maximum number of threads for clients and bricks can be configured
using the following options:

config.client-threads
config.brick-threads

These options can be applied online and their effect is immediate in most
cases. If one of them is set to 0, the maximum number of threads will be
calculated as #cores * 2.

Some distributions ship a very old userspace-rcu library (version 0.7). For
this reason, some header files from version 0.10 have been copied into
contrib/userspace-rcu and are used if the detected version is 0.7 or older.

To enable global threading for FUSE clients as well, you need to disable
client-side io-threads:

# gluster volume set <volname> performance.client-io-threads off

and then remount the volume with the --global-threading mount option, for example:

gluster volume set dht_vol performance.client-io-threads off
mount -t glusterfs -o acl,global-threading 127.0.0.1:/dht_vol /mnt/data
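
If the defaults do not fit the workload, the maximum thread counts described
above can be tuned on the same volume; the values below are arbitrary examples,
not recommendations:

gluster volume set dht_vol config.brick-threads 16
gluster volume set dht_vol config.client-threads 16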

workflow for pipeline

pipeline-v1.jpg

running result

Enter Ctrl+C to stop the pipeline program.

result.png

code example

package main

import (
	"log"
	"math/rand"
	"os"
	"os/signal"
	"syscall"
	"time"
)

/*
classic pipeline demo written by perrynzhou@gmail.com
*/
const (
	batchSize = 8
)

/*
note:
for range over a chan behaves as follows:
1. if the chan has data, the loop keeps running; if it has no data, the loop blocks
2. if the chan is closed, for range drains any remaining values and then exits the loop
*/
type PipeFeature struct {
	input1 chan int64
	input2 chan int64
	input3 chan int64
	done   chan struct{}
	stop   chan struct{}
}

func NewPipeFeature() *PipeFeature {
	return &PipeFeature{
		input1: make(chan int64, batchSize),
		input2: make(chan int64, batchSize),
		input3: make(chan int64, batchSize),
		done:   make(chan struct{}),
		stop:   make(chan struct{}),
	}
}

func (p *PipeFeature) init() {
	log.Println("...init running...")
	defer close(p.input1)
	for {
		select {
		case <-p.done:
			log.Println("...init stop...")
			return
		default:
			time.Sleep(5 * time.Millisecond)
			p.input1 <- rand.Int63n(65535)
		}
	}
}

func (p *PipeFeature) stage1() {
	log.Println("...stage1 running...")
	defer close(p.input2)
	for v := range p.input1 { // blocks until input1 is closed and drained
		v = v - rand.Int63n(1024)
		p.input2 <- v
	}
	log.Println("stage1 done...")
}

func (p *PipeFeature) stage2() {
	log.Println("...stage2 running...")
	defer close(p.input3)
	for v := range p.input2 {
		v = v + 1
		p.input3 <- v
	}
	log.Println("stage2 done...")
}

func (p *PipeFeature) stage3() {
	log.Println("...stage3 running...")
	for v3 := range p.input3 { // blocks until input3 is closed and drained
		v3 = v3 + rand.Int63n(100)
	}
	log.Println("stage3 done...")
}

func (p *PipeFeature) Run() {
	log.Println("start pipeline...")
	go p.init()   // order 2 - receives from done, closes input1 and returns
	go p.stage1() // order 3 - when input1 is closed, the loop ends and input2 is closed before returning
	go p.stage2() // order 4 - when input2 is closed, the loop ends and input3 is closed before returning
	// order 5 - when input3 is closed, stage3 returns
	p.stage3()           // blocks until input3 is closed
	p.stop <- struct{}{} // order 6 - signal the stop chan before Run returns
}

func (p *PipeFeature) Stop() {
	p.done <- struct{}{} // order 1 - tell init to stop
	// order 7 - receive from the stop chan, confirming the pipeline has drained
	<-p.stop
	log.Println("stop pipeline...")
}

func main() {
	pipe := NewPipeFeature()
	defer pipe.Stop()
	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, os.Interrupt, syscall.SIGTERM, syscall.SIGINT)
	go pipe.Run()
	for {
		select {
		case <-sigs:
			log.Println("receive stop signal")
			return
		}
	}
}