bcc/tools/zfsslower.py at master · chiluk/bcc

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

#!/usr/bin/python

# @lint-avoid-python-3-compatibility-imports

#

# zfsslower Trace slow ZFS operations.

# For Linux, uses BCC, eBPF.

#

# USAGE: zfsslower [-h] [-j] [-p PID] [min_ms]

#

# This script traces common ZFS file operations: reads, writes, opens, and

# syncs. It measures the time spent in these operations, and prints details

# for each that exceeded a threshold.

#

# WARNING: This adds low-overhead instrumentation to these ZFS operations,

# including reads and writes from the file system cache. Such reads and writes

# can be very frequent (depending on the workload; e.g., 1M/sec), at which

# point the overhead of this tool (even if it prints no "slower" events) can

# begin to become significant.

#

# This works by using kernel dynamic tracing of the ZPL interface, and will

# need updates to match any changes to this interface.

#

# By default, a minimum millisecond threshold of 10 is used.

#

# Licensed under the Apache License, Version 2.0 (the "License")

#

# 14-Feb-2016 Brendan Gregg Created this.

# 16-Oct-2016 Dina Goldshtein -p to filter by process ID.

from __future__ import print_function

from bcc import BPF

import argparse

from time import strftime

import ctypes as ct

# arguments
#
# Command-line interface. The epilog text below is shown verbatim by
# RawDescriptionHelpFormatter.
examples = """examples:

./zfsslower # trace operations slower than 10 ms (default)

./zfsslower 1 # trace operations slower than 1 ms

./zfsslower -j 1 # ... 1 ms, parsable output (csv)

./zfsslower 0 # trace all operations (warning: verbose)

./zfsslower -p 185 # trace PID 185 only

"""
parser = argparse.ArgumentParser(
    description="Trace common ZFS file operations slower than a threshold",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-j", "--csv", action="store_true",
    help="just print fields: comma-separated values")
# NOTE(review): the PID is kept as the raw command-line string because the
# rest of the script tests it for truthiness and interpolates it into the BPF
# program text; it is not validated here.
parser.add_argument("-p", "--pid",
    help="trace this PID only")
# type=int lets argparse reject a non-numeric threshold with a normal usage
# error instead of an uncaught ValueError after parsing.
parser.add_argument("min_ms", nargs="?", type=int, default=10,
    help="minimum I/O duration to trace, in ms (default 10)")
# hidden option: print the generated BPF program and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()

# Module-level settings read by the rest of the script.
min_ms = int(args.min_ms)   # threshold in milliseconds; 0 means "trace all"
pid = args.pid              # None when no -p was given
csv = args.csv              # True selects comma-separated output
debug = 0                   # set non-zero to always print the BPF program

# define BPF program
#
# C source for the BPF program; it is handed to BPF(text=...) further down
# after two textual substitutions performed by this script:
#   FILTER_PID - condition under which an entry probe ignores the current task
#   FILTER_US  - condition under which a completed operation is not reported
#
# Flow of the program:
#   * trace_rw_entry / trace_open_entry / trace_fsync_entry record a val_t
#     (entry timestamp, file pointer, offset) in the `entryinfo` hash, keyed
#     by the value of bpf_get_current_pid_tgid().
#   * trace_{read,write,open,fsync}_return all call trace_return(), which looks
#     the entry up, computes the elapsed time in microseconds, removes the
#     entry, applies FILTER_US and, if the event survives, fills a data_t and
#     submits it through the `events` perf output. Events whose dentry name
#     has length 0 are dropped.
#   * struct data_t is decoded on the Python side by the ctypes `Data` class,
#     so the two layouts have to be kept in step.
#
# NOTE(review): trace_return() reads valp->offset and valp->fp after
# entryinfo.delete(&id) has been called - confirm that is safe for this map.
# NOTE(review): trace_rw_entry() is declared with the zpl_read()/zpl_write()
# argument list but is also attached to the zpl_iter_* and zpl_aio_* variants
# by this script - confirm the arguments line up for those functions.

bpf_text = """

#include <uapi/linux/ptrace.h>

#include <linux/fs.h>

#include <linux/sched.h>

#include <linux/dcache.h>

// XXX: switch these to char's when supported

#define TRACE_READ 0

#define TRACE_WRITE 1

#define TRACE_OPEN 2

#define TRACE_FSYNC 3

struct val_t {

u64 ts;

u64 offset;

struct file *fp;

};

struct data_t {

// XXX: switch some to u32's when supported

u64 ts_us;

u64 type;

u64 size;

u64 offset;

u64 delta_us;

u64 pid;

char task[TASK_COMM_LEN];

char file[DNAME_INLINE_LEN];

};

BPF_HASH(entryinfo, u64, struct val_t);

BPF_PERF_OUTPUT(events);

//

// Store timestamp and size on entry

//

// zpl_read(), zpl_write():

int trace_rw_entry(struct pt_regs *ctx, struct file *filp, char __user *buf,

size_t len, loff_t *ppos)

{

u64 id = bpf_get_current_pid_tgid();

u32 pid = id >> 32; // PID is higher part

if (FILTER_PID)

return 0;

// store filep and timestamp by id

struct val_t val = {};

val.ts = bpf_ktime_get_ns();

val.fp = filp;

val.offset = *ppos;

if (val.fp)

entryinfo.update(&id, &val);

return 0;

}

// zpl_open():

int trace_open_entry(struct pt_regs *ctx, struct inode *inode,

struct file *filp)

{

u64 id = bpf_get_current_pid_tgid();

u32 pid = id >> 32; // PID is higher part

if (FILTER_PID)

return 0;

// store filep and timestamp by id

struct val_t val = {};

val.ts = bpf_ktime_get_ns();

val.fp = filp;

val.offset = 0;

if (val.fp)

entryinfo.update(&id, &val);

return 0;

}

// zpl_fsync():

int trace_fsync_entry(struct pt_regs *ctx, struct file *filp)

{

u64 id = bpf_get_current_pid_tgid();

u32 pid = id >> 32; // PID is higher part

if (FILTER_PID)

return 0;

// store filp and timestamp by id

struct val_t val = {};

val.ts = bpf_ktime_get_ns();

val.fp = filp;

val.offset = 0;

if (val.fp)

entryinfo.update(&id, &val);

return 0;

}

//

// Output

//

static int trace_return(struct pt_regs *ctx, int type)

{

struct val_t *valp;

u64 id = bpf_get_current_pid_tgid();

u32 pid = id >> 32; // PID is higher part

valp = entryinfo.lookup(&id);

if (valp == 0) {

// missed tracing issue or filtered

return 0;

}

// calculate delta

u64 ts = bpf_ktime_get_ns();

u64 delta_us = (ts - valp->ts) / 1000;

entryinfo.delete(&id);

if (FILTER_US)

return 0;

// populate output struct

u32 size = PT_REGS_RC(ctx);

struct data_t data = {.type = type, .size = size, .delta_us = delta_us,

.pid = pid};

data.ts_us = ts / 1000;

data.offset = valp->offset;

bpf_get_current_comm(&data.task, sizeof(data.task));

struct qstr qs = valp->fp->f_path.dentry->d_name;

if (qs.len == 0)

return 0;

bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name);

// output

events.perf_submit(ctx, &data, sizeof(data));

return 0;

}

int trace_read_return(struct pt_regs *ctx)

{

return trace_return(ctx, TRACE_READ);

}

int trace_write_return(struct pt_regs *ctx)

{

return trace_return(ctx, TRACE_WRITE);

}

int trace_open_return(struct pt_regs *ctx)

{

return trace_return(ctx, TRACE_OPEN);

}

int trace_fsync_return(struct pt_regs *ctx)

{

return trace_return(ctx, TRACE_FSYNC);

}

"""

# Bake the run-time filters into the program text. Each placeholder becomes
# a C expression; '0' means "never filter".

# latency filter: skip operations that took min_ms or less
us_filter = '0' if min_ms == 0 else 'delta_us <= %s' % str(min_ms * 1000)
bpf_text = bpf_text.replace('FILTER_US', us_filter)

# process filter: skip every task except the requested PID
pid_filter = ('pid != %s' % pid) if args.pid else '0'
bpf_text = bpf_text.replace('FILTER_PID', pid_filter)

# Show the final program text when debugging or when --ebpf was given;
# with --ebpf the script stops here without loading anything.
dump_only = args.ebpf
if dump_only or debug:
    print(bpf_text)
if dump_only:
    exit()

# kernel->user event data: struct data_t
DNAME_INLINE_LEN = 32   # linux/dcache.h
TASK_COMM_LEN = 16      # linux/sched.h


class Data(ct.Structure):
    """ctypes view of one event record (struct data_t in the BPF program).

    Six unsigned 64-bit counters come first, in the order listed below,
    followed by the fixed-size ``task`` and ``file`` character arrays.
    """
    _fields_ = [
        (name, ct.c_ulonglong)
        for name in ("ts_us", "type", "size", "offset", "delta_us", "pid")
    ] + [
        ("task", ct.c_char * TASK_COMM_LEN),
        ("file", ct.c_char * DNAME_INLINE_LEN),
    ]

# process event
def print_event(cpu, data, size):
    """Perf-buffer callback: decode one event record and print it.

    ``data`` is cast to a ``Data`` structure; ``cpu`` and ``size`` are part
    of the callback signature and are not used. The output is one CSV line
    when the module-level ``csv`` flag is set, otherwise one aligned
    human-readable line.
    """
    event = ct.cast(data, ct.POINTER(Data)).contents

    # The BPF program's TRACE_* codes: 1 = write, 2 = open, 3 = fsync;
    # anything else (including 0, read) is shown as 'R'.
    op = {1: 'W', 2: 'O', 3: 'S'}.get(event.type, 'R')
    comm = event.task.decode('utf-8', 'replace')
    filename = event.file.decode('utf-8', 'replace')

    if csv:
        # raw values: microsecond timestamps/latency, byte offset
        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
            event.ts_us, comm, event.pid, op, event.size, event.offset,
            event.delta_us, filename))
        return

    # wall-clock time, offset scaled to KB, latency scaled to ms
    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (
        strftime("%H:%M:%S"), comm, event.pid, op, event.size,
        event.offset / 1024, float(event.delta_us) / 1000, filename))

# initialize BPF
b = BPF(text=bpf_text)

# common file functions
#
# The ZPL read/write entry points come in three naming variants; work out
# once which one is available and reuse the answer for both the entry and
# the return probes. get_kprobe_functions() takes a regular expression, so
# the patterns are anchored on the full function name: a bare prefix such
# as 'zpl_iter' could also match unrelated symbols and select functions
# that cannot then be attached.
if BPF.get_kprobe_functions(b'zpl_iter_read$'):
    rw_prefix = "zpl_iter_"
elif BPF.get_kprobe_functions(b'zpl_aio_read$'):
    rw_prefix = "zpl_aio_"
else:
    rw_prefix = "zpl_"
read_fn = rw_prefix + "read"
write_fn = rw_prefix + "write"

# entry probes: remember when each operation started
b.attach_kprobe(event=read_fn, fn_name="trace_rw_entry")
b.attach_kprobe(event=write_fn, fn_name="trace_rw_entry")
b.attach_kprobe(event="zpl_open", fn_name="trace_open_entry")
b.attach_kprobe(event="zpl_fsync", fn_name="trace_fsync_entry")

# return probes: compute the latency and emit the event
b.attach_kretprobe(event=read_fn, fn_name="trace_read_return")
b.attach_kretprobe(event=write_fn, fn_name="trace_write_return")
b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return")
b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return")

# header
if csv:
    # column names for the comma-separated output
    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
else:
    # banner stating the active threshold, then the column titles
    banner = ("Tracing ZFS operations" if min_ms == 0
              else "Tracing ZFS operations slower than %d ms" % min_ms)
    print(banner)
    columns = ("TIME", "COMM", "PID", "T", "BYTES", "OFF_KB", "LAT(ms)",
               "FILENAME")
    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % columns)

# read events
#
# Register print_event as the consumer of the "events" perf buffer and poll
# until the user interrupts the tool.
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop tracing; leave without a traceback.
        exit()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

zfsslower.py

zfsslower.py

Files

zfsslower.py

Latest commit

History

zfsslower.py

File metadata and controls