1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
|
#!/usr/bin/python
import atexit
import copy
import optparse
import os
import pipes
import shutil
import string
import subprocess
import sys
import tempfile
import volfilter
import platform
# It's just more convenient to have named fields.
class Brick(object):
    """Bookkeeping record for one subvolume mounted at a local path.

    Fields:
      path      -- local path the subvolume is mounted on
      sv_name   -- subvolume name, as given to the constructor
      size      -- weight of the brick, as passed to set_size() (0 = unset)
      curr_size -- number of hash values in the current range (0 = no range)
      good_size -- target number of hash values; starts at 0 and is filled
                   in by code outside this class
      r_start   -- first hash value of the current range, or None
      r_end     -- last hash value (inclusive) of the current range, or None
    """

    def __init__(self, path, name):
        self.path = path
        self.sv_name = name
        self.size = 0
        self.curr_size = 0
        self.good_size = 0
        # Always define the range fields so that code which inspects
        # r_start on a brick without a layout does not hit AttributeError.
        self.r_start = None
        self.r_end = None

    def set_size(self, size):
        """Record the brick's weight."""
        self.size = size

    def set_range(self, rs, re):
        """Record the inclusive hash range [rs, re] and derive curr_size."""
        self.r_start = rs
        self.r_end = re
        self.curr_size = self.r_end - self.r_start + 1

    def __repr__(self):
        """Return 'path(size,0xSTART,0xEND)', or 'path(size,-)' if no range."""
        if self.curr_size:
            layout = "0x%x,0x%x)" % (self.r_start, self.r_end)
        else:
            layout = "-)"
        return "%s(%d,%s" % (self.path, self.size, layout)
def get_bricks(host, vol):
    """Run 'gluster --remote-host=HOST system getspec VOL' and return its output.

    host -- server to query
    vol  -- volume name

    Returns a readable text-mode file object connected to the command's
    standard output (presumably the volume's volfile; the caller hands it
    to volfilter.load).

    Raises OSError if the gluster binary cannot be executed.
    """
    # An argument list avoids handing unquoted user input to a shell and
    # does not rely on the deprecated 'pipes' module.
    argv = ["gluster", "--remote-host=%s" % host, "system", "getspec", vol]
    proc = subprocess.Popen(argv, stdout=subprocess.PIPE,
                            universal_newlines=True)
    return proc.stdout
def generate_stanza(vf, all_xlators, cur_subvol):
    """Write volfile stanzas for cur_subvol and everything beneath it.

    vf          -- writable file-like object
    all_xlators -- not used here; only passed along on recursion
    cur_subvol  -- object with .name, .type, .opts (a dict) and .subvols

    Children are emitted first (depth-first), so each stanza appears after
    the stanzas of the subvolumes it names.
    """
    child_names = []
    for child in cur_subvol.subvols:
        generate_stanza(vf, all_xlators, child)
        child_names.append(child.name)
    vf.write("volume %s\n" % cur_subvol.name)
    vf.write(" type %s\n" % cur_subvol.type)
    # .items() and str.join work on both Python 2 and 3, unlike
    # dict.iteritems() and string.join().
    for kvpair in cur_subvol.opts.items():
        vf.write(" option %s %s\n" % kvpair)
    if child_names:
        vf.write(" subvolumes %s\n" % " ".join(child_names))
    vf.write("end-volume\n\n")
def mount_brick(localpath, all_xlators, dht_subvol):
    """Write a volfile for dht_subvol and mount it at localpath.

    localpath   -- directory to create and mount on; the volfile is written
                   next to it as localpath + ".vol"
    all_xlators -- passed through to generate_stanza
    dht_subvol  -- subvolume whose stanzas make up the volfile

    Raises OSError if the directory cannot be created or glusterfs cannot
    be executed.  The exit status of glusterfs is not checked.
    """
    # Generate a volfile.  The 'with' block guarantees the file is closed
    # (and therefore fully written) before glusterfs reads it, even if
    # stanza generation fails part-way.
    vf_name = localpath + ".vol"
    with open(vf_name, "w") as vf:
        generate_stanza(vf, all_xlators, dht_subvol)
    # Create a brick directory and mount the brick there.
    os.mkdir(localpath)
    subprocess.call(["glusterfs", "-f", vf_name, localpath])
# We use the command-line tools because there's no getxattr support in the
# Python standard library (which is ridiculous IMO). Adding the xattr package
# from PyPI would create a new and difficult dependency because the bits to
# satisfy it don't seem to exist in Fedora. We already expect the command-line
# tools to be there, so it's safer just to rely on them.
#
# We might have to revisit this if we get as far as actually issuing millions
# of setxattr requests. Even then, it might be better to do that part with a C
# program which has only a build-time dependency.
def get_range(brick):
    """Read the trusted.glusterfs.dht xattr of a path and return its range.

    brick -- path to query with getfattr

    Returns (start, end) parsed from hex digits 16-23 and 24-31 of the
    attribute value, or None (after printing a notice) when the attribute
    cannot be read or is too short to contain both fields.
    """
    prefix = "trusted.glusterfs.dht="
    argv = ["getfattr", "-e", "hex", "-n", "trusted.glusterfs.dht", brick]
    try:
        # getfattr's diagnostics are discarded; only stdout is interesting.
        with open(os.devnull, "w") as devnull:
            proc = subprocess.Popen(argv, stdout=subprocess.PIPE,
                                    stderr=devnull, universal_newlines=True)
            output = proc.communicate()[0]
    except OSError:
        # getfattr missing or not executable: same outcome as "no layout".
        output = ""
    value = None
    for line in output.splitlines():
        if line.startswith(prefix):
            # Drop the attribute name and the leading "0x".
            value = line.rstrip()[len(prefix):][2:]
            break
    if value is None or len(value) < 32:
        print("could not get layout for %s (might be OK)" % brick)
        return None
    v_start = int(value[16:24], 16)
    v_end = int(value[24:32], 16)
    return (v_start, v_end)
def calc_sizes(bricks, total):
    """Set each brick's good_size to its share of the 2**32 hash space.

    bricks -- objects with a .size weight; .good_size is (over)written
    total  -- sum of all the .size values

    Every brick with a non-zero size gets floor(size * 2**32 / total)
    values; bricks with size 0 get none.  Whatever the rounding leaves
    over is given to the first brick that already has a share, or to the
    first brick if none does, so the shares always add up to 2**32.
    An empty brick list is a no-op.
    """
    if not bricks:
        return
    leftover = 1 << 32
    for b in bricks:
        if b.size:
            # Floor division keeps the arithmetic in integers on both
            # Python 2 and Python 3.
            b.good_size = (b.size << 32) // total
            leftover -= b.good_size
        else:
            b.good_size = 0
    if not leftover:
        return
    # Add the leftover to an old brick if we can.
    for b in bricks:
        if b.good_size:
            b.good_size += leftover
            break
    else:
        # Fine, just add it wherever.
        bricks[0].good_size += leftover
# Normalization means (a) sorting the bricks by r_start and (b) ensuring that
# there are no gaps.
def normalize(in_bricks):
    """Order bricks by their current hash range and verify full coverage.

    in_bricks -- bricks with .curr_size, plus .r_start/.r_end on those
                 whose curr_size is non-zero

    Starting at hash 0, repeatedly picks the brick whose range begins
    where the previous one ended, until 2**32 is reached.  If no brick
    starts at the required value, prints "gap found at ..." and exits
    with status 1.

    Returns (bricks, used): the chained bricks in range order followed by
    all remaining bricks in their original order, and the number of
    chained bricks.  The caller's list is left unmodified, so it is still
    complete if this function exits.
    """
    remaining = list(in_bricks)
    out_bricks = []
    curr_hash = 0
    while curr_hash < (1 << 32):
        for b in remaining:
            # Bricks without a current range (curr_size == 0) have no
            # meaningful r_start and can never continue the chain.
            if b.curr_size and b.r_start == curr_hash:
                break
        else:
            print("gap found at 0x%08x" % curr_hash)
            sys.exit(1)
        out_bricks.append(b)
        remaining.remove(b)
        curr_hash = b.r_end + 1
    return out_bricks + remaining, len(out_bricks)
def get_score(bricks):
    """Return how many hash values would stay on the brick that has them now.

    bricks -- bricks in candidate order, each with .good_size and
              .curr_size, plus .r_start/.r_end when curr_size is non-zero

    Walking the list in order, each brick is assigned the next good_size
    hash values.  The score is the total overlap between those new ranges
    and the bricks' current ranges; bricks without a current range
    contribute nothing but still consume their share of the hash space.
    """
    score = 0
    curr_hash = 0
    for b in bricks:
        new_start = curr_hash
        curr_hash += b.good_size
        if not b.curr_size:
            continue
        overlap_start = max(new_start, b.r_start)
        overlap_end = min(curr_hash - 1, b.r_end)
        if overlap_start <= overlap_end:
            score += overlap_end - overlap_start + 1
    return score
if __name__ == "__main__":

    # Command line: server and volume are mandatory, directory is optional.
    my_usage = "%prog [options] server volume [directory]"
    parser = optparse.OptionParser(usage=my_usage)
    parser.add_option("-f", "--free-space", dest="free_space",
                      default=False, action="store_true",
                      help="use free space instead of total space")
    parser.add_option("-l", "--leave-mounted", dest="leave_mounted",
                      default=False, action="store_true",
                      help="leave subvolumes mounted")
    parser.add_option("-v", "--verbose", dest="verbose",
                      default=False, action="store_true",
                      help="verbose output")
    options, args = parser.parse_args()

    if len(args) == 3:
        fix_dir = args[2]
    else:
        if len(args) != 2:
            parser.print_help()
            sys.exit(1)
        fix_dir = None
    hostname, volname = args[:2]

    # Make sure stuff gets cleaned up, even if there are exceptions.
    orig_dir = os.getcwd()
    work_dir = tempfile.mkdtemp()
    bricks = []
    # Cleanup works from its own list of mount points, so it stays complete
    # no matter how the brick list is reordered or modified later on.
    mount_points = []

    def cleanup_workdir():
        os.chdir(orig_dir)
        if options.verbose:
            print("Cleaning up %s" % work_dir)
        still_mounted = []
        for mount_point in mount_points:
            subprocess.call(["umount", mount_point])
            if os.path.ismount(mount_point):
                still_mounted.append(mount_point)
        if still_mounted:
            # Removing the tree now would recurse into a live mount and
            # delete the volume's contents, so leave everything in place.
            print("not removing %s, still mounted: %s" % (
                work_dir, " ".join(still_mounted)))
            return
        shutil.rmtree(work_dir)

    if not options.leave_mounted:
        atexit.register(cleanup_workdir)
    os.chdir(work_dir)

    # Mount each brick individually, so we can issue brick-specific calls.
    if options.verbose:
        print("Mounting subvolumes...")
    index = 0
    volfile_pipe = get_bricks(hostname, volname)
    all_xlators, last_xlator = volfilter.load(volfile_pipe)
    # NOTE(review): assumes all_xlators is a dict of translator objects.
    for dht_vol in all_xlators.values():
        if dht_vol.type == "cluster/distribute":
            break
    else:
        print("no DHT volume found")
        sys.exit(1)
    for sv in dht_vol.subvols:
        lpath = "%s/brick%s" % (work_dir, index)
        index += 1
        # Record the mount point first so that cleanup still tries to
        # unmount it if mount_brick fails after mounting.
        mount_points.append(lpath)
        mount_brick(lpath, all_xlators, sv)
        bricks.append(Brick(lpath, sv.name))
    if index == 0:
        print("no bricks")
        sys.exit(1)

    # Collect all of the sizes.
    if options.verbose:
        print("Collecting information...")
    # On FreeBSD f_bsize contains the optimal I/O size, not the block size
    # as it's found on Linux.  In this case we use f_frsize.
    use_frsize = (platform.system() == 'FreeBSD')
    total = 0
    for b in bricks:
        info = os.statvfs(b.path)
        if use_frsize:
            bsize = info.f_frsize
        else:
            bsize = info.f_bsize
        # We want a standard unit even if different bricks use
        # different block sizes.  The size is chosen to avoid overflows
        # for very large bricks with very small block sizes, but also
        # accommodate filesystems which use very large block sizes to
        # cheat on benchmarks.
        blocksper100mb = 104857600 // bsize
        if options.free_space:
            size = info.f_bfree // blocksper100mb
        else:
            size = info.f_blocks // blocksper100mb
        if size <= 0:
            print("brick %s has invalid size %d" % (b.path, size))
            sys.exit(1)
        b.set_size(size)
        total += size

    # Collect all of the layout information.
    for b in bricks:
        hash_range = get_range(b.path)
        if hash_range is not None:
            range_start, range_end = hash_range
            if range_start > range_end:
                print("%s has backwards hash range" % b.path)
                sys.exit(1)
            b.set_range(range_start, range_end)

    if options.verbose:
        print("Calculating new layouts...")
    calc_sizes(bricks, total)
    bricks, used = normalize(bricks)

    # We can't afford O(n!) here, but O(n^2) should be OK and the result
    # should be almost as good.  Bricks [0, used) are already placed; each
    # pass tries the next unplaced brick at every earlier position and
    # keeps the position with the best score.
    while used < len(bricks):
        best_place = used
        best_score = get_score(bricks)
        for i in range(used):
            new_bricks = bricks[:]
            del new_bricks[used]
            new_bricks.insert(i, bricks[used])
            new_score = get_score(new_bricks)
            if new_score > best_score:
                best_place = i
                best_score = new_score
        if best_place != used:
            nb = bricks[used]
            del bricks[used]
            bricks.insert(best_place, nb)
        used += 1

    # Finalize whatever we decided on.
    curr_hash = 0
    for b in bricks:
        b.r_start = curr_hash
        curr_hash += b.good_size
        b.r_end = curr_hash - 1

    print("Here are the xattr values for your size-weighted layout:")
    for b in bricks:
        print(" %s: 0x0000000200000000%08x%08x" % (
            b.sv_name, b.r_start, b.r_end))

    # The setfattr commands are only printed, never executed.
    if fix_dir:
        if options.verbose:
            print("Fixing layout for %s" % fix_dir)
        for b in bricks:
            value = "0x0000000200000000%08x%08x" % (
                b.r_start, b.r_end)
            path = "%s/%s" % (b.path, fix_dir)
            cmd = "setfattr -n trusted.glusterfs.dht -v %s %s" % (
                value, path)
            print(cmd)

    if options.leave_mounted:
        print("The following subvolumes are still mounted:")
        for b in bricks:
            print("%s on %s" % (b.sv_name, b.path))
        print("Don't forget to clean up when you're done.")
|