glusterfs

Форк
0
/
rebalance.py 
308 строк · 11.6 Кб
1
#!/usr/bin/python3
2

3
from __future__ import print_function
4

5
import atexit
6
import copy
7
import optparse
8
import os
9
import pipes
10
import shutil
11
import string
12
import subprocess
13
import sys
14
import tempfile
15
import volfilter
16
import platform
17

18
# It's just more convenient to have named fields.
19
class Brick:
        """Named fields describing one brick during the layout calculation.

        Attributes:
            path: local path the brick is addressed by.
            sv_name: name of the subvolume this brick corresponds to.
            size: weight of the brick (0 until set_size is called).
            r_start, r_end: inclusive bounds of the brick's current hash
                range, or None when no range has been set.
            curr_size: number of hash values in the current range
                (0 when no range has been set).
            good_size: number of hash values the brick should end up with.
        """
        def __init__ (self, path, name):
                self.path = path
                self.sv_name = name
                self.size = 0
                # Always define the range fields so that code inspecting a
                # brick without a layout sees None instead of raising
                # AttributeError.
                self.r_start = None
                self.r_end = None
                self.curr_size = 0
                self.good_size = 0

        def set_size (self, size):
                """Record the brick's weight."""
                self.size = size

        def set_range (self, rs, re):
                """Record the current inclusive hash range [rs, re]."""
                self.r_start = rs
                self.r_end = re
                self.curr_size = self.r_end - self.r_start + 1

        def __repr__ (self):
                """Return "path(size,0xstart,0xend)", or "path(size,-)" without a range."""
                if self.curr_size:
                        layout = "0x%x,0x%x)" % (self.r_start, self.r_end)
                else:
                        layout = "-)"
                return "%s(%d,%s" % (self.path, self.size, layout)
40

41
def get_bricks (host, vol):
        """Run `gluster system getspec` for a volume and return its output pipe.

        Args:
            host: value passed to gluster's --remote-host option.
            vol: name of the volume whose spec is requested.

        Returns:
            A readable text-mode file object connected to the command's
            standard output.

        Raises:
            OSError: if the gluster executable cannot be started.
        """
        # An argument list (no shell) is used so that host or volume names
        # containing shell metacharacters are passed through literally, and
        # so that the removed-in-3.13 `pipes` module is not needed here.
        cmd = ["gluster", "--remote-host=%s" % host, "system", "getspec", vol]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                universal_newlines=True)
        return proc.stdout
45

46
def generate_stanza (vf, all_xlators, cur_subvol):
        """Write the volfile stanza for cur_subvol, preceded by its children's.

        Child subvolumes are emitted first (depth-first) so that every name
        listed on a "subvolumes" line has already been defined.

        Args:
            vf: writable text file object receiving the volfile.
            all_xlators: unused here; passed through on recursion.
            cur_subvol: object with .name, .type, .opts (a dict) and
                .subvols (a list of objects of the same shape).
        """
        sv_list = []
        for sv in cur_subvol.subvols:
                generate_stanza(vf, all_xlators, sv)
                sv_list.append(sv.name)
        vf.write("volume %s\n" % cur_subvol.name)
        vf.write("  type %s\n" % cur_subvol.type)
        for key, val in cur_subvol.opts.items():
                vf.write("  option %s %s\n" % (key, val))
        if sv_list:
                # Names must be separated by spaces; joining with an empty
                # string would fuse them into one nonexistent name.
                vf.write("  subvolumes %s\n" % ' '.join(sv_list))
        vf.write("end-volume\n\n")
58

59

60
def mount_brick (localpath, all_xlators, dht_subvol):
        """Write a volfile for dht_subvol and mount it at localpath.

        The volfile is written to localpath + ".vol"; localpath itself is
        created as a directory and handed to the `glusterfs` command as the
        mount point.

        Args:
            localpath: directory to create and mount on.
            all_xlators: passed through to generate_stanza.
            dht_subvol: subvolume whose stanza tree is written to the volfile.
        """
        # Generate a volfile.  The context manager closes (and therefore
        # flushes) the file even if generate_stanza raises.
        vf_name = localpath + ".vol"
        with open(vf_name, "w") as vf:
                generate_stanza(vf, all_xlators, dht_subvol)

        # Create a brick directory and mount the brick there.
        os.mkdir(localpath)
        subprocess.call(["glusterfs", "-f", vf_name, localpath])
72

73
# We use the command-line tools because there's no getxattr support in the
74
# Python standard library (which is ridiculous IMO).  Adding the xattr package
75
# from PyPI would create a new and difficult dependency because the bits to
76
# satisfy it don't seem to exist in Fedora.  We already expect the command-line
77
# tools to be there, so it's safer just to rely on them.
78
#
79
# We might have to revisit this if we get as far as actually issuing millions
80
# of setxattr requests.  Even then, it might be better to do that part with a C
81
# program which has only a build-time dependency.
82
def get_range (brick):
        """Return the (start, end) hash range stored in a path's DHT xattr.

        Runs `getfattr -e hex -n trusted.glusterfs.dht <brick>` and decodes
        hex digits 16-23 and 24-31 of the value (after the "0x" prefix) as
        the start and end of the range.

        Args:
            brick: path whose trusted.glusterfs.dht xattr is read.

        Returns:
            A (v_start, v_end) tuple of ints, or None (after printing a
            notice) when the attribute could not be read or decoded.
        """
        # An argument list (no shell) means the path is never interpreted
        # by a shell, whatever characters it contains.
        cmd = ["getfattr", "-e", "hex", "-n", "trusted.glusterfs.dht", brick]
        try:
                proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        universal_newlines=True)
                # stderr is captured only to be discarded.
                output = proc.communicate()[0]
        except OSError:
                # getfattr could not be started; treat as "no layout".
                output = ""

        value = ""
        for line in output.splitlines():
                if line.startswith("trusted.glusterfs.dht="):
                        # Keep what follows '=' and drop the leading "0x".
                        value = line.rstrip().split("=", 1)[1][2:]
                        break

        # A short or non-hex value is reported the same way as a missing
        # one instead of escaping as an unhandled ValueError.
        if len(value) >= 32:
                try:
                        v_start = int(value[16:24], 16)
                        v_end = int(value[24:32], 16)
                        return (v_start, v_end)
                except ValueError:
                        pass
        print("could not get layout for %s (might be OK)" % brick)
        return None
96

97
def calc_sizes (bricks, total):
        """Set each brick's good_size to its size-weighted share of 2**32.

        Every brick with a nonzero size gets floor(size * 2**32 / total)
        hash values; bricks with size 0 get none.  Whatever is left over
        after rounding down is added to a single brick so that the shares
        always add up to exactly 2**32.

        Args:
            bricks: non-empty list of objects with an integer .size; their
                .good_size attribute is overwritten.
            total: integer sum of all brick sizes.
        """
        leftover = 1 << 32
        for b in bricks:
                if b.size:
                        # Floor division keeps the result an integer on
                        # Python 3 as well; hash ranges must be integral.
                        b.good_size = (b.size << 32) // total
                        leftover -= b.good_size
                else:
                        b.good_size = 0
        if leftover:
                # Add the leftover to the first brick that already has a
                # nonzero share, if there is one.
                for b in bricks:
                        if b.good_size:
                                b.good_size += leftover
                                break
                else:
                        # Fine, just add it wherever.
                        bricks[0].good_size += leftover
114

115
# Normalization means (a) sorting the bricks by r_start and (b) ensuring there
116
# are no gaps.
117
def normalize (in_bricks):
        """Order bricks by hash range and verify the ranges leave no gap.

        Starting at hash 0, repeatedly picks the brick whose r_start equals
        the next uncovered hash value until the whole 32-bit space is
        covered.  Picked bricks are removed from in_bricks (the argument is
        modified in place).

        Args:
            in_bricks: list of objects with r_start / r_end attributes;
                objects with no r_start (or r_start of None) are treated
                as having no current range.

        Returns:
            (bricks, used): the bricks in range order followed by the
            remaining bricks in their original order, and the number of
            bricks that were placed by range.

        Exits the process with status 1 if some hash value is not the start
        of any brick's range when it is needed (a gap).
        """
        out_bricks = []
        curr_hash = 0
        used = 0
        while curr_hash < (1 << 32):
                for b in in_bricks:
                        # getattr: a brick that never had a range set must
                        # simply not match, not raise AttributeError.
                        if getattr(b, "r_start", None) == curr_hash:
                                used += 1
                                out_bricks.append(b)
                                # Safe despite iterating in_bricks: we leave
                                # the loop immediately after removing.
                                in_bricks.remove(b)
                                curr_hash = b.r_end + 1
                                break
                else:
                        print("gap found at 0x%08x" % curr_hash)
                        sys.exit(1)
        return out_bricks + in_bricks, used
134

135
def get_score (bricks):
        """Score a brick ordering by how much of each current range it keeps.

        Walking the bricks in order, each one is assigned the next
        good_size hash values.  For every brick that has a current range
        (nonzero curr_size) the overlap between that range and the newly
        assigned one is added to the score.

        Returns the total number of overlapping hash values.
        """
        total_overlap = 0
        next_hash = 0
        for brick in bricks:
                proposed_start = next_hash
                next_hash += brick.good_size
                if not brick.curr_size:
                        # No existing range, so nothing can be preserved.
                        continue
                proposed_end = next_hash - 1
                # Intersection of the proposed and the existing range.
                overlap_start = max(proposed_start, brick.r_start)
                overlap_end = min(proposed_end, brick.r_end)
                if overlap_start <= overlap_end:
                        total_overlap += overlap_end - overlap_start + 1
        return total_overlap
156

157
if __name__ == "__main__":
        # Command-line driver: mount each subvolume of the volume's
        # distribute translator, weigh it by size, and print a
        # size-weighted hash layout that preserves as much of the
        # existing layout as the greedy placement below can manage.

        my_usage = "%prog [options] server volume [directory]"
        parser = optparse.OptionParser(usage=my_usage)
        parser.add_option("-f", "--free-space", dest="free_space",
                          default=False, action="store_true",
                          help="use free space instead of total space")
        parser.add_option("-l", "--leave-mounted", dest="leave_mounted",
                          default=False, action="store_true",
                          help="leave subvolumes mounted")
        parser.add_option("-v", "--verbose", dest="verbose",
                          default=False, action="store_true",
                          help="verbose output")
        options, args = parser.parse_args()

        if len(args) == 3:
                fix_dir = args[2]
        else:
                if len(args) != 2:
                        parser.print_help()
                        sys.exit(1)
                fix_dir = None
        hostname, volname = args[:2]

        # Make sure stuff gets cleaned up, even if there are exceptions.
        orig_dir = os.getcwd()
        work_dir = tempfile.mkdtemp()
        bricks = []
        def cleanup_workdir ():
                os.chdir(orig_dir)
                if options.verbose:
                        print("Cleaning up %s" % work_dir)
                for b in bricks:
                        subprocess.call(["umount", b.path])
                shutil.rmtree(work_dir)
        if not options.leave_mounted:
                atexit.register(cleanup_workdir)
        os.chdir(work_dir)

        # Mount each brick individually, so we can issue brick-specific calls.
        if options.verbose:
                print("Mounting subvolumes...")
        index = 0
        volfile_pipe = get_bricks(hostname, volname)
        all_xlators, last_xlator = volfilter.load(volfile_pipe)
        # values() rather than itervalues(): the latter does not exist on
        # Python 3 dicts.  (Assumes all_xlators is dict-like, as the
        # original itervalues() call already did.)
        for dht_vol in all_xlators.values():
                if dht_vol.type == "cluster/distribute":
                        break
        else:
                print("no DHT volume found")
                sys.exit(1)
        for sv in dht_vol.subvols:
                lpath = "%s/brick%s" % (work_dir, index)
                index += 1
                mount_brick(lpath, all_xlators, sv)
                bricks.append(Brick(lpath, sv.name))
        if index == 0:
                print("no bricks")
                sys.exit(1)

        # Collect all of the sizes.
        if options.verbose:
                print("Collecting information...")
        total = 0
        for b in bricks:
                info = os.statvfs(b.path)
                # On FreeBSD f_bsize contains the optimal I/O size, not the
                # block size as it's found on Linux. In this case we use
                # f_frsize.
                if platform.system() == 'FreeBSD':
                        bsize = info.f_frsize
                else:
                        bsize = info.f_bsize
                # We want a standard unit even if different bricks use
                # different block sizes.  The size is chosen to avoid overflows
                # for very large bricks with very small block sizes, but also
                # accommodate filesystems which use very large block sizes to
                # cheat on benchmarks.
                #
                # Floor division throughout: sizes are shifted and formatted
                # as integers later, so they must not become floats on
                # Python 3.
                blocksper100mb = 104857600 // bsize
                if options.free_space:
                        size = info.f_bfree // blocksper100mb
                else:
                        size = info.f_blocks // blocksper100mb
                if size <= 0:
                        print("brick %s has invalid size %d" % (b.path, size))
                        sys.exit(1)
                b.set_size(size)
                total += size

        # Collect all of the layout information.
        for b in bricks:
                hash_range = get_range(b.path)
                if hash_range is not None:
                        rs, re = hash_range
                        if rs > re:
                                print("%s has backwards hash range" % b.path)
                                sys.exit(1)
                        b.set_range(rs, re)

        if options.verbose:
                print("Calculating new layouts...")
        calc_sizes(bricks, total)
        bricks, used = normalize(bricks)

        # We can't afford O(n!) here, but O(n^2) should be OK and the result
        # should be almost as good.
        #
        # bricks[:used] are already placed.  Each remaining brick is tried
        # at every position among the placed ones and moved to the position
        # giving the best score (staying where it is on ties).
        while used < len(bricks):
                best_place = used
                best_score = get_score(bricks)
                for i in range(used):
                        new_bricks = bricks[:]
                        del new_bricks[used]
                        new_bricks.insert(i, bricks[used])
                        new_score = get_score(new_bricks)
                        if new_score > best_score:
                                best_place = i
                                best_score = new_score
                if best_place != used:
                        nb = bricks[used]
                        del bricks[used]
                        bricks.insert(best_place, nb)
                used += 1

        # Finalize whatever we decided on.
        curr_hash = 0
        for b in bricks:
                b.r_start = curr_hash
                curr_hash += b.good_size
                b.r_end = curr_hash - 1

        print("Here are the xattr values for your size-weighted layout:")
        for b in bricks:
                print("  %s: 0x0000000200000000%08x%08x" % (
                        b.sv_name, b.r_start, b.r_end))

        if fix_dir:
                if options.verbose:
                        print("Fixing layout for %s" % fix_dir)
                # The setfattr commands are only printed, not executed.
                for b in bricks:
                        value = "0x0000000200000000%08x%08x" % (
                                b.r_start, b.r_end)
                        path = "%s/%s" % (b.path, fix_dir)
                        cmd = "setfattr -n trusted.glusterfs.dht -v %s %s" % (
                                value, path)
                        print(cmd)

        if options.leave_mounted:
                print("The following subvolumes are still mounted:")
                for b in bricks:
                        print("%s on %s" % (b.sv_name, b.path))
                print("Don't forget to clean up when you're done.")
309

310

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.