diff -Nru fio-2.16/appveyor.yml fio-3.1/appveyor.yml --- fio-2.16/appveyor.yml 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/appveyor.yml 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,27 @@ +clone_depth: 50 +environment: + MAKEFLAGS: -j 2 + matrix: + - platform: x86_64 + BUILD_ARCH: x64 + CYG_ROOT: C:\cygwin64 + CONFIGURE_OPTIONS: + - platform: x86 + BUILD_ARCH: x86 + CYG_ROOT: C:\cygwin + CONFIGURE_OPTIONS: --build-32bit-win + +build_script: + - SET PATH=%CYG_ROOT%\bin;%PATH% + - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && ./configure --extra-cflags=\"-Werror\" ${CONFIGURE_OPTIONS} && make.exe' + +after_build: + - cd os\windows && dobuild.cmd %BUILD_ARCH% + +test_script: + - SET PATH=%CYG_ROOT%\bin;%PATH% + - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && file.exe fio.exe && make.exe test' + +artifacts: + - path: os\windows\*.msi + name: msi diff -Nru fio-2.16/arch/arch-aarch64.h fio-3.1/arch/arch-aarch64.h --- fio-2.16/arch/arch-aarch64.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/arch/arch-aarch64.h 2017-09-28 10:23:20.000000000 +0000 @@ -27,4 +27,8 @@ #define ARCH_HAVE_FFZ +#ifdef ARCH_HAVE_CRC_CRYPTO +#define ARCH_HAVE_ARM64_CRC_CRYPTO +#endif + #endif diff -Nru fio-2.16/arch/arch-arm.h fio-3.1/arch/arch-arm.h --- fio-2.16/arch/arch-arm.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/arch/arch-arm.h 2017-09-28 10:23:20.000000000 +0000 @@ -14,6 +14,8 @@ #define nop __asm__ __volatile__ ("nop") #define read_barrier() __sync_synchronize() #define write_barrier() __sync_synchronize() +#else +#error "unsupported ARM architecture" #endif #endif diff -Nru fio-2.16/arch/arch.h fio-3.1/arch/arch.h --- fio-2.16/arch/arch.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/arch/arch.h 2017-09-28 10:23:20.000000000 +0000 @@ -1,6 +1,8 @@ #ifndef ARCH_H #define ARCH_H +#include "../lib/types.h" + enum { arch_x86_64 = 1, arch_x86, diff -Nru fio-2.16/arch/arch-ia64.h fio-3.1/arch/arch-ia64.h --- fio-2.16/arch/arch-ia64.h 2016-12-20 06:12:56.000000000 +0000 
+++ fio-3.1/arch/arch-ia64.h 2017-09-28 10:23:20.000000000 +0000 @@ -28,10 +28,10 @@ } #define ARCH_HAVE_INIT -extern int tsc_reliable; +extern bool tsc_reliable; static inline int arch_init(char *envp[]) { - tsc_reliable = 1; + tsc_reliable = true; return 0; } diff -Nru fio-2.16/arch/arch-ppc.h fio-3.1/arch/arch-ppc.h --- fio-2.16/arch/arch-ppc.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/arch/arch-ppc.h 2017-09-28 10:23:20.000000000 +0000 @@ -62,7 +62,8 @@ " cmpwi %0,0;\n" " beq- 90b;\n" : "=r" (rval) - : "i" (SPRN_TBRL)); + : "i" (SPRN_TBRL) + : "cr0"); return rval; } @@ -117,12 +118,12 @@ #endif #define ARCH_HAVE_INIT -extern int tsc_reliable; +extern bool tsc_reliable; static inline int arch_init(char *envp[]) { #if 0 - tsc_reliable = 1; + tsc_reliable = true; atb_clocktest(); #endif return 0; diff -Nru fio-2.16/arch/arch-s390.h fio-3.1/arch/arch-s390.h --- fio-2.16/arch/arch-s390.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/arch/arch-s390.h 2017-09-28 10:23:20.000000000 +0000 @@ -28,10 +28,10 @@ #undef ARCH_CPU_CLOCK_WRAPS #define ARCH_HAVE_INIT -extern int tsc_reliable; +extern bool tsc_reliable; static inline int arch_init(char *envp[]) { - tsc_reliable = 1; + tsc_reliable = true; return 0; } diff -Nru fio-2.16/arch/arch-x86-common.h fio-3.1/arch/arch-x86-common.h --- fio-2.16/arch/arch-x86-common.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/arch/arch-x86-common.h 2017-09-28 10:23:20.000000000 +0000 @@ -14,7 +14,7 @@ #define ARCH_HAVE_INIT -extern int tsc_reliable; +extern bool tsc_reliable; extern int arch_random; static inline void arch_init_intel(unsigned int level) diff -Nru fio-2.16/backend.c fio-3.1/backend.c --- fio-2.16/backend.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/backend.c 2017-09-28 10:23:20.000000000 +0000 @@ -76,9 +76,6 @@ int temp_stall_ts; unsigned long done_secs = 0; -#define PAGE_ALIGN(buf) \ - (char *) (((uintptr_t) (buf) + page_mask) & ~page_mask) - #define JOB_START_TIMEOUT (5 * 1000) static void sig_int(int 
sig) @@ -139,7 +136,7 @@ /* * Check if we are above the minimum rate given. */ -static bool __check_min_rate(struct thread_data *td, struct timeval *now, +static bool __check_min_rate(struct thread_data *td, struct timespec *now, enum fio_ddir ddir) { unsigned long long bytes = 0; @@ -180,8 +177,8 @@ * check bandwidth specified rate */ if (bytes < td->rate_bytes[ddir]) { - log_err("%s: min rate %u not met\n", td->o.name, - ratemin); + log_err("%s: rate_min=%uB/s not met, only transferred %lluB\n", + td->o.name, ratemin, bytes); return true; } else { if (spent) @@ -191,9 +188,8 @@ if (rate < ratemin || bytes < td->rate_bytes[ddir]) { - log_err("%s: min rate %u not met, got" - " %luKB/sec\n", td->o.name, - ratemin, rate); + log_err("%s: rate_min=%uB/s not met, got %luB/s\n", + td->o.name, ratemin, rate); return true; } } @@ -202,8 +198,8 @@ * checks iops specified rate */ if (iops < rate_iops) { - log_err("%s: min iops rate %u not met\n", - td->o.name, rate_iops); + log_err("%s: rate_iops_min=%u not met, only performed %lu IOs\n", + td->o.name, rate_iops, iops); return true; } else { if (spent) @@ -213,9 +209,8 @@ if (rate < rate_iops_min || iops < td->rate_blocks[ddir]) { - log_err("%s: min iops rate %u not met," - " got %lu\n", td->o.name, - rate_iops_min, rate); + log_err("%s: rate_iops_min=%u not met, got %lu IOPS\n", + td->o.name, rate_iops_min, rate); return true; } } @@ -228,7 +223,7 @@ return false; } -static bool check_min_rate(struct thread_data *td, struct timeval *now) +static bool check_min_rate(struct thread_data *td, struct timespec *now) { bool ret = false; @@ -340,18 +335,18 @@ return ret; } -static inline void __update_tv_cache(struct thread_data *td) +static inline void __update_ts_cache(struct thread_data *td) { - fio_gettime(&td->tv_cache, NULL); + fio_gettime(&td->ts_cache, NULL); } -static inline void update_tv_cache(struct thread_data *td) +static inline void update_ts_cache(struct thread_data *td) { - if ((++td->tv_cache_nr & 
td->tv_cache_mask) == td->tv_cache_mask) - __update_tv_cache(td); + if ((++td->ts_cache_nr & td->ts_cache_mask) == td->ts_cache_mask) + __update_ts_cache(td); } -static inline bool runtime_exceeded(struct thread_data *td, struct timeval *t) +static inline bool runtime_exceeded(struct thread_data *td, struct timespec *t) { if (in_ramp_time(td)) return false; @@ -435,7 +430,7 @@ } } -static int wait_for_completions(struct thread_data *td, struct timeval *time) +static int wait_for_completions(struct thread_data *td, struct timespec *time) { const int full = queue_full(td); int min_evts = 0; @@ -467,7 +462,7 @@ int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify, - struct timeval *comp_time) + struct timespec *comp_time) { int ret2; @@ -592,6 +587,50 @@ } /* + * Check if io_u will overlap an in-flight IO in the queue + */ +static bool in_flight_overlap(struct io_u_queue *q, struct io_u *io_u) +{ + bool overlap; + struct io_u *check_io_u; + unsigned long long x1, x2, y1, y2; + int i; + + x1 = io_u->offset; + x2 = io_u->offset + io_u->buflen; + overlap = false; + io_u_qiter(q, check_io_u, i) { + if (check_io_u->flags & IO_U_F_FLIGHT) { + y1 = check_io_u->offset; + y2 = check_io_u->offset + check_io_u->buflen; + + if (x1 < y2 && y1 < x2) { + overlap = true; + dprint(FD_IO, "in-flight overlap: %llu/%lu, %llu/%lu\n", + x1, io_u->buflen, + y1, check_io_u->buflen); + break; + } + } + } + + return overlap; +} + +static int io_u_submit(struct thread_data *td, struct io_u *io_u) +{ + /* + * Check for overlap if the user asked us to, and we have + * at least one IO in flight besides this one. + */ + if (td->o.serialize_overlap && td->cur_depth > 1 && + in_flight_overlap(&td->io_u_all, io_u)) + return FIO_Q_BUSY; + + return td_io_queue(td, io_u); +} + +/* * The main verify engine. Runs over the writes we previously submitted, * reads the blocks back in, and checks the crc/md5 of the data. 
*/ @@ -638,12 +677,12 @@ enum fio_ddir ddir; int full; - update_tv_cache(td); + update_ts_cache(td); check_update_rusage(td); - if (runtime_exceeded(td, &td->tv_cache)) { - __update_tv_cache(td); - if (runtime_exceeded(td, &td->tv_cache)) { + if (runtime_exceeded(td, &td->ts_cache)) { + __update_ts_cache(td); + if (runtime_exceeded(td, &td->ts_cache)) { fio_mark_td_terminate(td); break; } @@ -721,7 +760,7 @@ if (!td->o.disable_slat) fio_gettime(&io_u->start_time, NULL); - ret = td_io_queue(td, io_u); + ret = io_u_submit(td, io_u); if (io_queue_event(td, io_u, &ret, ddir, NULL, 1, NULL)) break; @@ -781,8 +820,8 @@ else bytes = this_bytes[DDIR_TRIM]; - if (td->o.io_limit) - limit = td->o.io_limit; + if (td->o.io_size) + limit = td->o.io_size; else limit = td->o.size; @@ -816,13 +855,14 @@ uint64_t val; iops = bps / td->o.bs[ddir]; val = (int64_t) (1000000 / iops) * - -logf(__rand_0_1(&td->poisson_state)); + -logf(__rand_0_1(&td->poisson_state[ddir])); if (val) { - dprint(FD_RATE, "poisson rate iops=%llu\n", - (unsigned long long) 1000000 / val); + dprint(FD_RATE, "poisson rate iops=%llu, ddir=%d\n", + (unsigned long long) 1000000 / val, + ddir); } - td->last_usec += val; - return td->last_usec; + td->last_usec[ddir] += val; + return td->last_usec[ddir]; } else if (bps) { secs = bytes / bps; remainder = bytes % bps; @@ -856,11 +896,11 @@ total_bytes = td->o.size; /* - * Allow random overwrite workloads to write up to io_limit + * Allow random overwrite workloads to write up to io_size * before starting verification phase as 'size' doesn't apply. */ if (td_write(td) && td_random(td) && td->o.norandommap) - total_bytes = max(total_bytes, (uint64_t) td->o.io_limit); + total_bytes = max(total_bytes, (uint64_t) td->o.io_size); /* * If verify_backlog is enabled, we'll run the verify in this * handler as well. 
For that case, we may need up to twice the @@ -878,7 +918,7 @@ while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) || (!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) || td->o.time_based) { - struct timeval comp_time; + struct timespec comp_time; struct io_u *io_u; int full; enum fio_ddir ddir; @@ -888,11 +928,11 @@ if (td->terminate || td->done) break; - update_tv_cache(td); + update_ts_cache(td); - if (runtime_exceeded(td, &td->tv_cache)) { - __update_tv_cache(td); - if (runtime_exceeded(td, &td->tv_cache)) { + if (runtime_exceeded(td, &td->ts_cache)) { + __update_ts_cache(td); + if (runtime_exceeded(td, &td->ts_cache)) { fio_mark_td_terminate(td); break; } @@ -987,7 +1027,7 @@ td->rate_next_io_time[ddir] = usec_for_io(td, ddir); } else { - ret = td_io_queue(td, io_u); + ret = io_u_submit(td, io_u); if (should_check_rate(td)) td->rate_next_io_time[ddir] = usec_for_io(td, ddir); @@ -1200,7 +1240,7 @@ if (td->o.odirect || td->o.mem_align || td->o.oatomic || td_ioengine_flagged(td, FIO_RAWIO)) - p = PAGE_ALIGN(td->orig_buffer) + td->o.mem_align; + p = PTR_ALIGN(td->orig_buffer, page_mask) + td->o.mem_align; else p = td->orig_buffer; @@ -1266,6 +1306,10 @@ return 0; } +/* + * This function is Linux specific. + * FIO_HAVE_IOSCHED_SWITCH enabled currently means it's Linux. 
+ */ static int switch_ioscheduler(struct thread_data *td) { #ifdef FIO_HAVE_IOSCHED_SWITCH @@ -1276,7 +1320,8 @@ if (td_ioengine_flagged(td, FIO_DISKLESSIO)) return 0; - sprintf(tmp, "%s/queue/scheduler", td->sysfs_root); + assert(td->files && td->files[0]); + sprintf(tmp, "%s/queue/scheduler", td->files[0]->du->sysfs_root); f = fopen(tmp, "r+"); if (!f) { @@ -1346,6 +1391,8 @@ if (td->done) return false; + if (td->terminate) + return false; if (td->o.time_based) return true; if (td->o.loops) { @@ -1355,8 +1402,8 @@ if (exceeds_number_ios(td)) return false; - if (td->o.io_limit) - limit = td->o.io_limit; + if (td->o.io_size) + limit = td->o.io_size; else limit = td->o.size; @@ -1364,14 +1411,14 @@ uint64_t diff; /* - * If the difference is less than the minimum IO size, we + * If the difference is less than the maximum IO size, we * are done. */ diff = limit - ddir_rw_sum(td->io_bytes); if (diff < td_max_bs(td)) return false; - if (fio_files_done(td) && !td->o.io_limit) + if (fio_files_done(td) && !td->o.io_size) return false; return true; @@ -1456,6 +1503,7 @@ struct thread_data *td = fd->td; struct thread_options *o = &td->o; struct sk_out *sk_out = fd->sk_out; + uint64_t bytes_done[DDIR_RWDIR_CNT]; int deadlock_loop_cnt; int clear_state; int ret; @@ -1677,12 +1725,14 @@ sizeof(td->bw_sample_time)); } + memset(bytes_done, 0, sizeof(bytes_done)); clear_state = 0; + while (keep_running(td)) { uint64_t verify_bytes; fio_gettime(&td->start, NULL); - memcpy(&td->tv_cache, &td->start, sizeof(td->start)); + memcpy(&td->ts_cache, &td->start, sizeof(td->start)); if (clear_state) { clear_io_state(td, 0); @@ -1693,11 +1743,9 @@ prune_io_piece_log(td); - if (td->o.verify_only && (td_write(td) || td_rw(td))) + if (td->o.verify_only && td_write(td)) verify_bytes = do_dry_run(td); else { - uint64_t bytes_done[DDIR_RWDIR_CNT]; - do_io(td, bytes_done); if (!ddir_rw_sum(bytes_done)) { @@ -1776,6 +1824,18 @@ break; } + /* + * If td ended up with no I/O when it should have had, + * 
then something went wrong unless FIO_NOIO or FIO_DISKLESSIO. + * (Are we not missing other flags that can be ignored ?) + */ + if ((td->o.size || td->o.io_size) && !ddir_rw_sum(bytes_done) && + !(td_ioengine_flagged(td, FIO_NOIO) || + td_ioengine_flagged(td, FIO_DISKLESSIO))) + log_err("%s: No I/O performed by %s, " + "perhaps try --debug=io option for details?\n", + td->o.name, td->io_ops->name); + td_set_runstate(td, TD_FINISHING); update_rusage_stat(td); @@ -1836,9 +1896,6 @@ if (o->write_iolog_file) write_iolog_close(td); - fio_mutex_remove(td->mutex); - td->mutex = NULL; - td_set_runstate(td, TD_EXITED); /* @@ -1851,14 +1908,6 @@ return (void *) (uintptr_t) td->error; } -static void dump_td_info(struct thread_data *td) -{ - log_err("fio: job '%s' (state=%d) hasn't exited in %lu seconds, it " - "appears to be stuck. Doing forceful exit of this job.\n", - td->o.name, td->runstate, - (unsigned long) time_since_now(&td->terminate_time)); -} - /* * Run over the job map and reap the threads that have exited, if any. */ @@ -1943,7 +1992,11 @@ if (td->terminate && td->runstate < TD_FSYNCING && time_since_now(&td->terminate_time) >= FIO_REAP_TIMEOUT) { - dump_td_info(td); + log_err("fio: job '%s' (state=%d) hasn't exited in " + "%lu seconds, it appears to be stuck. Doing " + "forceful exit of this job.\n", + td->o.name, td->runstate, + (unsigned long) time_since_now(&td->terminate_time)); td_set_runstate(td, TD_REAPED); goto reaped; } @@ -1991,7 +2044,10 @@ static bool trigger_timedout(void) { if (trigger_timeout) - return time_since_genesis() >= trigger_timeout; + if (time_since_genesis() >= trigger_timeout) { + trigger_timeout = 0; + return true; + } return false; } @@ -2000,7 +2056,7 @@ { int ret; - if (!cmd) + if (!cmd || cmd[0] == '\0') return; ret = system(cmd); @@ -2056,8 +2112,16 @@ if (!td_write(td) || td->o.allow_mounted_write) return false; + /* + * If FIO_HAVE_CHARDEV_SIZE is defined, it's likely that chrdevs + * are mkfs'd and mounted. 
+ */ for_each_file(td, f, i) { - if (f->filetype != FIO_TYPE_BD) +#ifdef FIO_HAVE_CHARDEV_SIZE + if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) +#else + if (f->filetype != FIO_TYPE_BLOCK) +#endif continue; if (device_is_mounted(f->file_name)) goto mounted; @@ -2065,7 +2129,7 @@ return false; mounted: - log_err("fio: %s appears mounted, and 'allow_mounted_write' isn't set. Aborting.", f->file_name); + log_err("fio: %s appears mounted, and 'allow_mounted_write' isn't set. Aborting.\n", f->file_name); return true; } @@ -2187,7 +2251,7 @@ while (todo) { struct thread_data *map[REAL_MAX_JOBS]; - struct timeval this_start; + struct timespec this_start; int this_jobs = 0, left; struct fork_data *fd; @@ -2427,6 +2491,8 @@ fio_mutex_remove(td->rusage_sem); td->rusage_sem = NULL; } + fio_mutex_remove(td->mutex); + td->mutex = NULL; } free_disk_util(); diff -Nru fio-2.16/blktrace_api.h fio-3.1/blktrace_api.h --- fio-2.16/blktrace_api.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/blktrace_api.h 2017-09-28 10:23:20.000000000 +0000 @@ -127,9 +127,4 @@ __u32 pid; }; -#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) -#define BLKTRACESTART _IO(0x12,116) -#define BLKTRACESTOP _IO(0x12,117) -#define BLKTRACETEARDOWN _IO(0x12,118) - #endif diff -Nru fio-2.16/blktrace.c fio-3.1/blktrace.c --- fio-2.16/blktrace.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/blktrace.c 2017-09-28 10:23:20.000000000 +0000 @@ -10,6 +10,7 @@ #include "flist.h" #include "fio.h" +#include "blktrace.h" #include "blktrace_api.h" #include "oslib/linux-dev-lookup.h" diff -Nru fio-2.16/blktrace.h fio-3.1/blktrace.h --- fio-2.16/blktrace.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/blktrace.h 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,23 @@ +#ifndef FIO_BLKTRACE_H +#define FIO_BLKTRACE_H + +#ifdef FIO_HAVE_BLKTRACE + +int is_blktrace(const char *, int *); +int load_blktrace(struct thread_data *, const char *, int); + +#else + +static inline int is_blktrace(const 
char *fname, int *need_swap) +{ + return 0; +} + +static inline int load_blktrace(struct thread_data *td, const char *fname, + int need_swap) +{ + return 1; +} + +#endif +#endif diff -Nru fio-2.16/cconv.c fio-3.1/cconv.c --- fio-2.16/cconv.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/cconv.c 2017-09-28 10:23:20.000000000 +0000 @@ -88,7 +88,7 @@ o->td_ddir = le32_to_cpu(top->td_ddir); o->rw_seq = le32_to_cpu(top->rw_seq); o->kb_base = le32_to_cpu(top->kb_base); - o->unit_base = le32_to_cpu(top->kb_base); + o->unit_base = le32_to_cpu(top->unit_base); o->ddir_seq_nr = le32_to_cpu(top->ddir_seq_nr); o->ddir_seq_add = le64_to_cpu(top->ddir_seq_add); o->iodepth = le32_to_cpu(top->iodepth); @@ -96,14 +96,16 @@ o->iodepth_batch = le32_to_cpu(top->iodepth_batch); o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min); o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max); + o->serialize_overlap = le32_to_cpu(top->serialize_overlap); o->size = le64_to_cpu(top->size); - o->io_limit = le64_to_cpu(top->io_limit); + o->io_size = le64_to_cpu(top->io_size); o->size_percent = le32_to_cpu(top->size_percent); o->fill_device = le32_to_cpu(top->fill_device); o->file_append = le32_to_cpu(top->file_append); o->file_size_low = le64_to_cpu(top->file_size_low); o->file_size_high = le64_to_cpu(top->file_size_high); o->start_offset = le64_to_cpu(top->start_offset); + o->start_offset_percent = le32_to_cpu(top->start_offset_percent); for (i = 0; i < DDIR_RWDIR_CNT; i++) { o->bs[i] = le32_to_cpu(top->bs[i]); @@ -155,6 +157,7 @@ o->end_fsync = le32_to_cpu(top->end_fsync); o->pre_read = le32_to_cpu(top->pre_read); o->sync_io = le32_to_cpu(top->sync_io); + o->write_hint = le32_to_cpu(top->write_hint); o->verify = le32_to_cpu(top->verify); o->do_verify = le32_to_cpu(top->do_verify); o->verifysort = le32_to_cpu(top->verifysort); @@ -235,6 +238,7 @@ o->new_group = le32_to_cpu(top->new_group); o->numjobs = le32_to_cpu(top->numjobs); 
o->cpus_allowed_policy = le32_to_cpu(top->cpus_allowed_policy); + o->gpu_dev_id = le32_to_cpu(top->gpu_dev_id); o->iolog = le32_to_cpu(top->iolog); o->rwmixcycle = le32_to_cpu(top->rwmixcycle); o->nice = le32_to_cpu(top->nice); @@ -242,6 +246,7 @@ o->ioprio_class = le32_to_cpu(top->ioprio_class); o->file_service_type = le32_to_cpu(top->file_service_type); o->group_reporting = le32_to_cpu(top->group_reporting); + o->stats = le32_to_cpu(top->stats); o->fadvise_hint = le32_to_cpu(top->fadvise_hint); o->fallocate_mode = le32_to_cpu(top->fallocate_mode); o->zero_buffers = le32_to_cpu(top->zero_buffers); @@ -262,6 +267,7 @@ o->trim_batch = le32_to_cpu(top->trim_batch); o->trim_zero = le32_to_cpu(top->trim_zero); o->clat_percentiles = le32_to_cpu(top->clat_percentiles); + o->lat_percentiles = le32_to_cpu(top->lat_percentiles); o->percentile_precision = le32_to_cpu(top->percentile_precision); o->continue_on_error = le32_to_cpu(top->continue_on_error); o->cgroup_weight = le32_to_cpu(top->cgroup_weight); @@ -279,7 +285,6 @@ o->compress_percentage = le32_to_cpu(top->compress_percentage); o->compress_chunk = le32_to_cpu(top->compress_chunk); o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage); - o->skip_bad = le32_to_cpu(top->skip_bad); o->block_error_hist = le32_to_cpu(top->block_error_hist); o->replay_align = le32_to_cpu(top->replay_align); o->replay_scale = le32_to_cpu(top->replay_scale); @@ -336,13 +341,14 @@ top->td_ddir = cpu_to_le32(o->td_ddir); top->rw_seq = cpu_to_le32(o->rw_seq); top->kb_base = cpu_to_le32(o->kb_base); - top->unit_base = cpu_to_le32(o->kb_base); + top->unit_base = cpu_to_le32(o->unit_base); top->ddir_seq_nr = cpu_to_le32(o->ddir_seq_nr); top->iodepth = cpu_to_le32(o->iodepth); top->iodepth_low = cpu_to_le32(o->iodepth_low); top->iodepth_batch = cpu_to_le32(o->iodepth_batch); top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min); top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max); + 
top->serialize_overlap = cpu_to_le32(o->serialize_overlap); top->size_percent = cpu_to_le32(o->size_percent); top->fill_device = cpu_to_le32(o->fill_device); top->file_append = cpu_to_le32(o->file_append); @@ -362,6 +368,7 @@ top->end_fsync = cpu_to_le32(o->end_fsync); top->pre_read = cpu_to_le32(o->pre_read); top->sync_io = cpu_to_le32(o->sync_io); + top->write_hint = cpu_to_le32(o->write_hint); top->verify = cpu_to_le32(o->verify); top->do_verify = cpu_to_le32(o->do_verify); top->verifysort = cpu_to_le32(o->verifysort); @@ -419,6 +426,7 @@ top->new_group = cpu_to_le32(o->new_group); top->numjobs = cpu_to_le32(o->numjobs); top->cpus_allowed_policy = cpu_to_le32(o->cpus_allowed_policy); + top->gpu_dev_id = cpu_to_le32(o->gpu_dev_id); top->iolog = cpu_to_le32(o->iolog); top->rwmixcycle = cpu_to_le32(o->rwmixcycle); top->nice = cpu_to_le32(o->nice); @@ -426,6 +434,7 @@ top->ioprio_class = cpu_to_le32(o->ioprio_class); top->file_service_type = cpu_to_le32(o->file_service_type); top->group_reporting = cpu_to_le32(o->group_reporting); + top->stats = cpu_to_le32(o->stats); top->fadvise_hint = cpu_to_le32(o->fadvise_hint); top->fallocate_mode = cpu_to_le32(o->fallocate_mode); top->zero_buffers = cpu_to_le32(o->zero_buffers); @@ -446,6 +455,7 @@ top->trim_batch = cpu_to_le32(o->trim_batch); top->trim_zero = cpu_to_le32(o->trim_zero); top->clat_percentiles = cpu_to_le32(o->clat_percentiles); + top->lat_percentiles = cpu_to_le32(o->lat_percentiles); top->percentile_precision = cpu_to_le32(o->percentile_precision); top->continue_on_error = cpu_to_le32(o->continue_on_error); top->cgroup_weight = cpu_to_le32(o->cgroup_weight); @@ -464,7 +474,6 @@ top->compress_chunk = cpu_to_le32(o->compress_chunk); top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage); top->block_error_hist = cpu_to_le32(o->block_error_hist); - top->skip_bad = cpu_to_le32(o->skip_bad); top->replay_align = cpu_to_le32(o->replay_align); top->replay_scale = cpu_to_le32(o->replay_scale); top->per_job_logs = 
cpu_to_le32(o->per_job_logs); @@ -521,7 +530,7 @@ memcpy(top->buffer_pattern, o->buffer_pattern, MAX_PATTERN_SIZE); top->size = __cpu_to_le64(o->size); - top->io_limit = __cpu_to_le64(o->io_limit); + top->io_size = __cpu_to_le64(o->io_size); top->verify_backlog = __cpu_to_le64(o->verify_backlog); top->start_delay = __cpu_to_le64(o->start_delay); top->start_delay_high = __cpu_to_le64(o->start_delay_high); @@ -539,6 +548,7 @@ top->file_size_low = __cpu_to_le64(o->file_size_low); top->file_size_high = __cpu_to_le64(o->file_size_high); top->start_offset = __cpu_to_le64(o->start_offset); + top->start_offset_percent = __cpu_to_le32(o->start_offset_percent); top->trim_backlog = __cpu_to_le64(o->trim_backlog); top->offset_increment = __cpu_to_le64(o->offset_increment); top->number_ios = __cpu_to_le64(o->number_ios); diff -Nru fio-2.16/client.c fio-3.1/client.c --- fio-2.16/client.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/client.c 2017-09-28 10:23:20.000000000 +0000 @@ -48,7 +48,7 @@ .client_type = FIO_CLIENT_TYPE_CLI, }; -static struct timeval eta_tv; +static struct timespec eta_ts; static FLIST_HEAD(client_list); static FLIST_HEAD(eta_list); @@ -318,7 +318,7 @@ client->hostname = strdup(hostname); if (type == Fio_client_socket) - client->is_sock = 1; + client->is_sock = true; else { int ipv6; @@ -728,7 +728,7 @@ strcpy((char *) pdu->file, filename); pdu->client_type = cpu_to_le16((uint16_t) client->type); - client->sent_job = 1; + client->sent_job = true; ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_LOAD_FILE, pdu, p_size,NULL, NULL); free(pdu); return ret; @@ -781,7 +781,7 @@ pdu->buf_len = __cpu_to_le32(sb.st_size); pdu->client_type = cpu_to_le32(client->type); - client->sent_job = 1; + client->sent_job = true; ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_JOB, pdu, p_size, NULL, NULL); free(pdu); close(fd); @@ -799,7 +799,7 @@ ret = __fio_client_send_remote_ini(client, filename); if (!ret) - client->sent_job = 1; + client->sent_job = true; return ret; } @@ 
-885,6 +885,7 @@ convert_io_stat(&dst->slat_stat[i], &src->slat_stat[i]); convert_io_stat(&dst->lat_stat[i], &src->lat_stat[i]); convert_io_stat(&dst->bw_stat[i], &src->bw_stat[i]); + convert_io_stat(&dst->iops_stat[i], &src->iops_stat[i]); } dst->usr_time = le64_to_cpu(src->usr_time); @@ -892,7 +893,8 @@ dst->ctx = le64_to_cpu(src->ctx); dst->minf = le64_to_cpu(src->minf); dst->majf = le64_to_cpu(src->majf); - dst->clat_percentiles = le64_to_cpu(src->clat_percentiles); + dst->clat_percentiles = le32_to_cpu(src->clat_percentiles); + dst->lat_percentiles = le32_to_cpu(src->lat_percentiles); dst->percentile_precision = le64_to_cpu(src->percentile_precision); for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { @@ -908,10 +910,12 @@ dst->io_u_complete[i] = le32_to_cpu(src->io_u_complete[i]); } - for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) { + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + dst->io_u_lat_n[i] = le32_to_cpu(src->io_u_lat_n[i]); + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) dst->io_u_lat_u[i] = le32_to_cpu(src->io_u_lat_u[i]); + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) dst->io_u_lat_m[i] = le32_to_cpu(src->io_u_lat_m[i]); - } for (i = 0; i < DDIR_RWDIR_CNT; i++) for (j = 0; j < FIO_IO_U_PLAT_NR; j++) @@ -972,7 +976,7 @@ dst->min_run[i] = le64_to_cpu(src->min_run[i]); dst->max_bw[i] = le64_to_cpu(src->max_bw[i]); dst->min_bw[i] = le64_to_cpu(src->min_bw[i]); - dst->io_kb[i] = le64_to_cpu(src->io_kb[i]); + dst->iobytes[i] = le64_to_cpu(src->iobytes[i]); dst->agg[i] = le64_to_cpu(src->agg[i]); } @@ -1001,7 +1005,7 @@ opt_list = &client->opt_lists[p->ts.thread_number - 1]; tsobj = show_thread_status(&p->ts, &p->rs, opt_list, NULL); - client->did_stat = 1; + client->did_stat = true; if (tsobj) { json_object_add_client_info(tsobj, client); json_array_add_value_object(clients_array, tsobj); @@ -1123,7 +1127,7 @@ struct cmd_du_pdu *du = (struct cmd_du_pdu *) cmd->payload; if (!client->disk_stats_shown) { - client->disk_stats_shown = 1; + client->disk_stats_shown = true; 
log_info("\nDisk stats (read/write):\n"); } @@ -1322,7 +1326,7 @@ log_pathname = malloc(10 + strlen((char *)pdu->name) + strlen(client->hostname)); if (!log_pathname) { - log_err("fio: memory allocation of unique pathname failed"); + log_err("fio: memory allocation of unique pathname failed\n"); return -1; } /* generate a unique pathname for the log file using hostname */ @@ -1450,7 +1454,7 @@ z_stream stream; uint32_t nr_samples; size_t total; - void *p; + char *p; stream.zalloc = Z_NULL; stream.zfree = Z_NULL; @@ -1476,10 +1480,10 @@ memcpy(ret, pdu, sizeof(*pdu)); - p = (void *) ret + sizeof(*pdu); + p = (char *) ret + sizeof(*pdu); stream.avail_in = cmd->pdu_len - sizeof(*pdu); - stream.next_in = (void *) pdu + sizeof(*pdu); + stream.next_in = (void *)((char *) pdu + sizeof(*pdu)); while (stream.avail_in) { unsigned int this_chunk = 65536; unsigned int this_len; @@ -1489,7 +1493,7 @@ this_chunk = total; stream.avail_out = this_chunk; - stream.next_out = p; + stream.next_out = (void *)p; err = inflate(&stream, Z_NO_FLUSH); /* may be Z_OK, or Z_STREAM_END */ if (err < 0) { @@ -1564,7 +1568,7 @@ s = __get_sample(samples, ret->log_offset, i); if (ret->log_type == IO_LOG_TYPE_HIST) - s = (struct io_sample *)((void *)s + sizeof(struct io_u_plat_entry) * i); + s = (struct io_sample *)((char *)s + sizeof(struct io_u_plat_entry) * i); s->time = le64_to_cpu(s->time); s->data.val = le64_to_cpu(s->data.val); @@ -1578,7 +1582,7 @@ } if (ret->log_type == IO_LOG_TYPE_HIST) { - s->data.plat_entry = (struct io_u_plat_entry *)(((void *)s) + sizeof(*s)); + s->data.plat_entry = (struct io_u_plat_entry *)(((char *)s) + sizeof(*s)); s->data.plat_entry->list.next = NULL; s->data.plat_entry->list.prev = NULL; } @@ -1869,7 +1873,7 @@ } static int client_check_cmd_timeout(struct fio_client *client, - struct timeval *now) + struct timespec *now) { struct fio_net_cmd_reply *reply; struct flist_head *entry, *tmp; @@ -1878,7 +1882,7 @@ flist_for_each_safe(entry, tmp, &client->cmd_list) { 
reply = flist_entry(entry, struct fio_net_cmd_reply, list); - if (mtime_since(&reply->tv, now) < FIO_NET_CLIENT_TIMEOUT) + if (mtime_since(&reply->ts, now) < FIO_NET_CLIENT_TIMEOUT) continue; if (!handle_cmd_timeout(client, reply)) @@ -1896,10 +1900,10 @@ { struct fio_client *client; struct flist_head *entry, *tmp; - struct timeval tv; + struct timespec ts; int ret = 0; - fio_gettime(&tv, NULL); + fio_gettime(&ts, NULL); flist_for_each_safe(entry, tmp, &client_list) { client = flist_entry(entry, struct fio_client, list); @@ -1907,7 +1911,7 @@ if (flist_empty(&client->cmd_list)) continue; - if (!client_check_cmd_timeout(client, &tv)) + if (!client_check_cmd_timeout(client, &ts)) continue; if (client->ops->timed_out) @@ -1928,7 +1932,7 @@ struct pollfd *pfds; int i, ret = 0, retval = 0; - fio_gettime(&eta_tv, NULL); + fio_gettime(&eta_ts, NULL); pfds = malloc(nr_clients * sizeof(struct pollfd)); @@ -1960,13 +1964,13 @@ assert(i == nr_clients); do { - struct timeval tv; + struct timespec ts; int timeout; - fio_gettime(&tv, NULL); - if (mtime_since(&eta_tv, &tv) >= 900) { + fio_gettime(&ts, NULL); + if (mtime_since(&eta_ts, &ts) >= 900) { request_client_etas(ops); - memcpy(&eta_tv, &tv, sizeof(tv)); + memcpy(&eta_ts, &ts, sizeof(ts)); if (fio_check_clients_timed_out()) break; diff -Nru fio-2.16/client.h fio-3.1/client.h --- fio-2.16/client.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/client.h 2017-09-28 10:23:20.000000000 +0000 @@ -6,6 +6,7 @@ #include #include +#include "lib/types.h" #include "stat.h" struct fio_net_cmd; @@ -45,16 +46,16 @@ int state; - int skip_newline; - int is_sock; - int disk_stats_shown; + bool skip_newline; + bool is_sock; + bool disk_stats_shown; unsigned int jobs; unsigned int nr_stat; int error; int signal; int ipv6; - int sent_job; - int did_stat; + bool sent_job; + bool did_stat; uint32_t type; uint32_t thread_number; diff -Nru fio-2.16/compiler/compiler.h fio-3.1/compiler/compiler.h --- fio-2.16/compiler/compiler.h 2016-12-20 
06:12:56.000000000 +0000 +++ fio-3.1/compiler/compiler.h 2017-09-28 10:23:20.000000000 +0000 @@ -38,10 +38,12 @@ #if defined(CONFIG_STATIC_ASSERT) #define compiletime_assert(condition, msg) _Static_assert(condition, msg) -#else +#elif !defined(CONFIG_DISABLE_OPTIMIZATIONS) + #ifndef __compiletime_error #define __compiletime_error(message) #endif + #ifndef __compiletime_error_fallback #define __compiletime_error_fallback(condition) do { } while (0) #endif @@ -61,6 +63,10 @@ #define compiletime_assert(condition, msg) \ _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) +#else + +#define compiletime_assert(condition, msg) do { } while (0) + #endif #endif diff -Nru fio-2.16/configure fio-3.1/configure --- fio-2.16/configure 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/configure 2017-09-28 10:23:20.000000000 +0000 @@ -37,6 +37,11 @@ exit 1 } +# Print result for each configuration test +print_config() { + printf "%-30s%s\n" "$1" "$2" +} + # Default CFLAGS CFLAGS="-D_GNU_SOURCE -include config-host.h" BUILD_CFLAGS="" @@ -138,6 +143,7 @@ pmemblk="no" devdax="no" disable_lex="" +disable_pmem="no" prefix=/usr/local # parse options @@ -160,11 +166,12 @@ ;; --build-static) build_static="yes" ;; - --enable-gfio) - gfio_check="yes" + --enable-gfio) gfio_check="yes" ;; --disable-numa) disable_numa="yes" ;; + --disable-rdma) disable_rdma="yes" + ;; --disable-rbd) disable_rbd="yes" ;; --disable-rbd-blkin) disable_rbd_blkin="yes" @@ -173,10 +180,6 @@ ;; --enable-libhdfs) libhdfs="yes" ;; - --enable-pmemblk) pmemblk="yes" - ;; - --enable-devdax) devdax="yes" - ;; --disable-lex) disable_lex="yes" ;; --enable-lex) disable_lex="no" @@ -185,6 +188,10 @@ ;; --disable-optimizations) disable_opt="yes" ;; + --disable-pmem) disable_pmem="yes" + ;; + --enable-cuda) enable_cuda="yes" + ;; --help) show_help="yes" ;; @@ -196,23 +203,24 @@ done if test "$show_help" = "yes" ; then - echo "--prefix= Use this directory as installation prefix" - echo "--cpu= Specify target CPU if 
auto-detect fails" - echo "--cc= Specify compiler to use" - echo "--extra-cflags= Specify extra CFLAGS to pass to compiler" - echo "--build-32bit-win Enable 32-bit build on Windows" - echo "--build-static Build a static fio" - echo "--esx Configure build options for esx" - echo "--enable-gfio Enable building of gtk gfio" - echo "--disable-numa Disable libnuma even if found" - echo "--disable-gfapi Disable gfapi" - echo "--enable-libhdfs Enable hdfs support" - echo "--enable-pmemblk Enable NVML libpmemblk support" - echo "--enable-devdax Enable NVM Device Dax support" - echo "--disable-lex Disable use of lex/yacc for math" - echo "--enable-lex Enable use of lex/yacc for math" - echo "--disable-shm Disable SHM support" + echo "--prefix= Use this directory as installation prefix" + echo "--cpu= Specify target CPU if auto-detect fails" + echo "--cc= Specify compiler to use" + echo "--extra-cflags= Specify extra CFLAGS to pass to compiler" + echo "--build-32bit-win Enable 32-bit build on Windows" + echo "--build-static Build a static fio" + echo "--esx Configure build options for esx" + echo "--enable-gfio Enable building of gtk gfio" + echo "--disable-numa Disable libnuma even if found" + echo "--disable-rdma Disable RDMA support even if found" + echo "--disable-gfapi Disable gfapi" + echo "--enable-libhdfs Enable hdfs support" + echo "--disable-lex Disable use of lex/yacc for math" + echo "--disable-pmem Disable pmem based engines even if found" + echo "--enable-lex Enable use of lex/yacc for math" + echo "--disable-shm Disable SHM support" echo "--disable-optimizations Don't enable compiler optimizations" + echo "--enable-cuda Enable GPUDirect RDMA support" exit $exit_val fi @@ -253,8 +261,9 @@ # cross-compiling to one of these OSes then you'll need to specify # the correct CPU with the --cpu option. case $targetos in -AIX) +AIX|OpenBSD) # Unless explicitly enabled, turn off lex. + # OpenBSD will hit syntax error when enabled. 
if test -z "$disable_lex" ; then disable_lex="yes" else @@ -270,6 +279,17 @@ if test -z "$cpu" && test "$(sysctl -n hw.optional.x86_64)" = "1"; then cpu="x86_64" fi + # Error at compile time linking of weak/partial symbols if possible... +cat > $TMPC <> $config_host_mak + fi else CC="x86_64-w64-mingw32-gcc" + if test -e "../zlib/contrib/vstudio/vc14/x64/ZlibStatReleaseWithoutAsm/zlibstat.lib"; then + echo "Building with zlib support" + output_sym "CONFIG_ZLIB" + echo "LIBS=../zlib/contrib/vstudio/vc14/x64/ZlibStatReleaseWithoutAsm/zlibstat.lib" >> $config_host_mak + fi fi fi - output_sym "CONFIG_LITTLE_ENDIAN" if test ! -z "$build_32bit_win" && test "$build_32bit_win" = "yes"; then output_sym "CONFIG_32BIT" else output_sym "CONFIG_64BIT_LLP64" fi - output_sym "CONFIG_FADVISE" - output_sym "CONFIG_SOCKLEN_T" - output_sym "CONFIG_FADVISE" - output_sym "CONFIG_SFAA" - output_sym "CONFIG_RUSAGE_THREAD" + # We need this to be output_sym'd here because this is Windows specific. + # The regular configure path never sets this config. output_sym "CONFIG_WINDOWSAIO" - output_sym "CONFIG_FDATASYNC" - output_sym "CONFIG_CLOCK_MONOTONIC" - output_sym "CONFIG_GETTIMEOFDAY" - output_sym "CONFIG_CLOCK_GETTIME" - output_sym "CONFIG_SCHED_IDLE" - output_sym "CONFIG_TCP_NODELAY" - output_sym "CONFIG_TLS_THREAD" - output_sym "CONFIG_IPV6" + # We now take the regular configuration path without having exit 0 here. + # Flags below are still necessary mostly for MinGW. 
+ socklen_t="yes" + sfaa="yes" + rusage_thread="yes" + fdatasync="yes" + clock_gettime="yes" # clock_monotonic probe has dependency on this + clock_monotonic="yes" + gettimeofday="yes" + sched_idle="yes" + tcp_nodelay="yes" + tls_thread="yes" + static_assert="yes" + ipv6="yes" echo "CC=$CC" >> $config_host_mak - echo "BUILD_CFLAGS=$CFLAGS -include config-host.h -D_GNU_SOURCE" >> $config_host_mak - exit 0 + echo "BUILD_CFLAGS=$CFLAGS -I../zlib -include config-host.h -D_GNU_SOURCE" >> $config_host_mak ;; esac @@ -344,6 +376,8 @@ fi elif check_define __arm__ ; then cpu="arm" +elif check_define __aarch64__ ; then + cpu="aarch64" elif check_define __hppa__ ; then cpu="hppa" else @@ -356,7 +390,7 @@ cpu="$cpu" ;; i386|i486|i586|i686|i86pc|BePC) - cpu="i386" + cpu="x86" ;; x86_64|amd64) cpu="x86_64" @@ -364,6 +398,9 @@ armv*b|armv*l|arm) cpu="arm" ;; + aarch64) + cpu="arm64" + ;; hppa|parisc|parisc64) cpu="hppa" ;; @@ -393,7 +430,9 @@ ########################################## # check cross compile -cross_compile="no" +if test "$cross_compile" != "yes" ; then + cross_compile="no" +fi cat > $TMPC < $TMPC < @@ -439,11 +480,11 @@ fi -echo "Operating system $targetos" -echo "CPU $cpu" -echo "Big endian $bigendian" -echo "Compiler $cc" -echo "Cross compile $cross_compile" +print_config "Operating system" "$targetos" +print_config "CPU" "$cpu" +print_config "Big endian" "$bigendian" +print_config "Compiler" "$cc" +print_config "Cross compile" "$cross_compile" echo ########################################## @@ -454,7 +495,7 @@ else build_static="no" fi -echo "Static build $build_static" +print_config "Static build" "$build_static" ########################################## # check for wordsize @@ -475,11 +516,13 @@ else fatal "Unknown wordsize" fi -echo "Wordsize $wordsize" +print_config "Wordsize" "$wordsize" ########################################## # zlib probe -zlib="no" +if test "$zlib" != "yes" ; then + zlib="no" +fi cat > $TMPC < int main(void) @@ -494,11 +537,13 @@ 
zlib=yes LIBS="-lz $LIBS" fi -echo "zlib $zlib" +print_config "zlib" "$zlib" ########################################## # linux-aio probe -libaio="no" +if test "$libaio" != "yes" ; then + libaio="no" +fi if test "$esx" != "yes" ; then cat > $TMPC < @@ -519,12 +564,16 @@ libaio=no fi fi -echo "Linux AIO support $libaio" +print_config "Linux AIO support" "$libaio" ########################################## # posix aio probe -posix_aio="no" -posix_aio_lrt="no" +if test "$posix_aio" != "yes" ; then + posix_aio="no" +fi +if test "$posix_aio_lrt" != "yes" ; then + posix_aio_lrt="no" +fi cat > $TMPC < int main(void) @@ -541,12 +590,14 @@ posix_aio_lrt="yes" LIBS="-lrt $LIBS" fi -echo "POSIX AIO support $posix_aio" -echo "POSIX AIO support needs -lrt $posix_aio_lrt" +print_config "POSIX AIO support" "$posix_aio" +print_config "POSIX AIO support needs -lrt" "$posix_aio_lrt" ########################################## # posix aio fsync probe -posix_aio_fsync="no" +if test "$posix_aio_fsync" != "yes" ; then + posix_aio_fsync="no" +fi if test "$posix_aio" = "yes" ; then cat > $TMPC < @@ -562,11 +613,43 @@ posix_aio_fsync=yes fi fi -echo "POSIX AIO fsync $posix_aio_fsync" +print_config "POSIX AIO fsync" "$posix_aio_fsync" + +########################################## +# POSIX pshared attribute probe +if test "$posix_pshared" != "yes" ; then + posix_pshared="no" +fi +cat > $TMPC < +int main(void) +{ +#if defined(_POSIX_THREAD_PROCESS_SHARED) && ((_POSIX_THREAD_PROCESS_SHARED + 0) > 0) +# if defined(__CYGWIN__) +# error "_POSIX_THREAD_PROCESS_SHARED is buggy on Cygwin" +# elif defined(__APPLE__) +# include +# include +# if TARGET_OS_MAC && MAC_OS_X_VERSION_MIN_REQUIRED < 1070 +# error "_POSIX_THREAD_PROCESS_SHARED is buggy/unsupported prior to OSX 10.7" +# endif +# endif +#else +# error "_POSIX_THREAD_PROCESS_SHARED is unsupported" +#endif + return 0; +} +EOF +if compile_prog "" "$LIBS" "posix_pshared" ; then + posix_pshared=yes +fi +print_config "POSIX pshared support" 
"$posix_pshared" ########################################## # solaris aio probe -solaris_aio="no" +if test "$solaris_aio" != "yes" ; then + solaris_aio="no" +fi cat > $TMPC < #include @@ -582,11 +665,13 @@ solaris_aio=yes LIBS="-laio $LIBS" fi -echo "Solaris AIO support $solaris_aio" +print_config "Solaris AIO support" "$solaris_aio" ########################################## # __sync_fetch_and_add test -sfaa="no" +if test "$sfaa" != "yes" ; then + sfaa="no" +fi cat > $TMPC << EOF #include static int sfaa(uint64_t *ptr) @@ -604,29 +689,32 @@ if compile_prog "" "" "__sync_fetch_and_add()" ; then sfaa="yes" fi -echo "__sync_fetch_and_add $sfaa" +print_config "__sync_fetch_and_add" "$sfaa" ########################################## # libverbs probe -libverbs="no" +if test "$libverbs" != "yes" ; then + libverbs="no" +fi cat > $TMPC << EOF -#include -#include +#include int main(int argc, char **argv) { struct ibv_pd *pd = ibv_alloc_pd(NULL); return 0; } EOF -if compile_prog "" "-libverbs" "libverbs" ; then +if test "$disable_rdma" != "yes" && compile_prog "" "-libverbs" "libverbs" ; then libverbs="yes" LIBS="-libverbs $LIBS" fi -echo "libverbs $libverbs" +print_config "libverbs" "$libverbs" ########################################## # rdmacm probe -rdmacm="no" +if test "$rdmacm" != "yes" ; then + rdmacm="no" +fi cat > $TMPC << EOF #include #include @@ -636,15 +724,17 @@ return 0; } EOF -if compile_prog "" "-lrdmacm" "rdma"; then +if test "$disable_rdma" != "yes" && compile_prog "" "-lrdmacm" "rdma"; then rdmacm="yes" LIBS="-lrdmacm $LIBS" fi -echo "rdmacm $rdmacm" +print_config "rdmacm" "$rdmacm" ########################################## # Linux fallocate probe -linux_fallocate="no" +if test "$linux_fallocate" != "yes" ; then + linux_fallocate="no" +fi cat > $TMPC << EOF #include #include @@ -658,11 +748,13 @@ if compile_prog "" "" "linux_fallocate"; then linux_fallocate="yes" fi -echo "Linux fallocate $linux_fallocate" +print_config "Linux fallocate" 
"$linux_fallocate" ########################################## # POSIX fadvise probe -posix_fadvise="no" +if test "$posix_fadvise" != "yes" ; then + posix_fadvise="no" +fi cat > $TMPC << EOF #include #include @@ -675,11 +767,13 @@ if compile_prog "" "" "posix_fadvise"; then posix_fadvise="yes" fi -echo "POSIX fadvise $posix_fadvise" +print_config "POSIX fadvise" "$posix_fadvise" ########################################## # POSIX fallocate probe -posix_fallocate="no" +if test "$posix_fallocate" != "yes" ; then + posix_fallocate="no" +fi cat > $TMPC << EOF #include #include @@ -692,12 +786,16 @@ if compile_prog "" "" "posix_fallocate"; then posix_fallocate="yes" fi -echo "POSIX fallocate $posix_fallocate" +print_config "POSIX fallocate" "$posix_fallocate" ########################################## # sched_set/getaffinity 2 or 3 argument test -linux_2arg_affinity="no" -linux_3arg_affinity="no" +if test "$linux_2arg_affinity" != "yes" ; then + linux_2arg_affinity="no" +fi +if test "$linux_3arg_affinity" != "yes" ; then + linux_3arg_affinity="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -721,12 +819,14 @@ linux_2arg_affinity="yes" fi fi -echo "sched_setaffinity(3 arg) $linux_3arg_affinity" -echo "sched_setaffinity(2 arg) $linux_2arg_affinity" +print_config "sched_setaffinity(3 arg)" "$linux_3arg_affinity" +print_config "sched_setaffinity(2 arg)" "$linux_2arg_affinity" ########################################## # clock_gettime probe -clock_gettime="no" +if test "$clock_gettime" != "yes" ; then + clock_gettime="no" +fi cat > $TMPC << EOF #include #include @@ -741,11 +841,13 @@ clock_gettime="yes" LIBS="-lrt $LIBS" fi -echo "clock_gettime $clock_gettime" +print_config "clock_gettime" "$clock_gettime" ########################################## # CLOCK_MONOTONIC probe -clock_monotonic="no" +if test "$clock_monotonic" != "yes" ; then + clock_monotonic="no" +fi if test "$clock_gettime" = "yes" ; then cat > $TMPC << EOF #include @@ -759,11 +861,13 @@ 
clock_monotonic="yes" fi fi -echo "CLOCK_MONOTONIC $clock_monotonic" +print_config "CLOCK_MONOTONIC" "$clock_monotonic" ########################################## # CLOCK_MONOTONIC_RAW probe -clock_monotonic_raw="no" +if test "$clock_monotonic_raw" != "yes" ; then + clock_monotonic_raw="no" +fi if test "$clock_gettime" = "yes" ; then cat > $TMPC << EOF #include @@ -777,11 +881,13 @@ clock_monotonic_raw="yes" fi fi -echo "CLOCK_MONOTONIC_RAW $clock_monotonic_raw" +print_config "CLOCK_MONOTONIC_RAW" "$clock_monotonic_raw" ########################################## # CLOCK_MONOTONIC_PRECISE probe -clock_monotonic_precise="no" +if test "$clock_monotonic_precise" != "yes" ; then + clock_monotonic_precise="no" +fi if test "$clock_gettime" = "yes" ; then cat > $TMPC << EOF #include @@ -795,30 +901,33 @@ clock_monotonic_precise="yes" fi fi -echo "CLOCK_MONOTONIC_PRECISE $clock_monotonic_precise" +print_config "CLOCK_MONOTONIC_PRECISE" "$clock_monotonic_precise" ########################################## # clockid_t probe -clockid_t="no" +if test "$clockid_t" != "yes" ; then + clockid_t="no" +fi cat > $TMPC << EOF -#include -#include #include +#include int main(int argc, char **argv) { - clockid_t cid; - memset(&cid, 0, sizeof(cid)); - return clock_gettime(cid, NULL); + volatile clockid_t cid; + memset((void*)&cid, 0, sizeof(cid)); + return 0; } EOF if compile_prog "" "$LIBS" "clockid_t"; then clockid_t="yes" fi -echo "clockid_t $clockid_t" +print_config "clockid_t" "$clockid_t" ########################################## # gettimeofday() probe -gettimeofday="no" +if test "$gettimeofday" != "yes" ; then + gettimeofday="no" +fi cat > $TMPC << EOF #include #include @@ -831,11 +940,13 @@ if compile_prog "" "" "gettimeofday"; then gettimeofday="yes" fi -echo "gettimeofday $gettimeofday" +print_config "gettimeofday" "$gettimeofday" ########################################## # fdatasync() probe -fdatasync="no" +if test "$fdatasync" != "yes" ; then + fdatasync="no" +fi cat > $TMPC 
<< EOF #include #include @@ -847,11 +958,13 @@ if compile_prog "" "" "fdatasync"; then fdatasync="yes" fi -echo "fdatasync $fdatasync" +print_config "fdatasync" "$fdatasync" ########################################## # sync_file_range() probe -sync_file_range="no" +if test "$sync_file_range" != "yes" ; then + sync_file_range="no" +fi cat > $TMPC << EOF #include #include @@ -867,11 +980,13 @@ if compile_prog "" "" "sync_file_range"; then sync_file_range="yes" fi -echo "sync_file_range $sync_file_range" +print_config "sync_file_range" "$sync_file_range" ########################################## # ext4 move extent probe -ext4_me="no" +if test "$ext4_me" != "yes" ; then + ext4_me="no" +fi cat > $TMPC << EOF #include #include @@ -889,11 +1004,13 @@ # work. Takes a while to bubble back. ext4_me="yes" fi -echo "EXT4 move extent $ext4_me" +print_config "EXT4 move extent" "$ext4_me" ########################################## # splice probe -linux_splice="no" +if test "$linux_splice" != "yes" ; then + linux_splice="no" +fi cat > $TMPC << EOF #include #include @@ -905,11 +1022,13 @@ if compile_prog "" "" "linux splice"; then linux_splice="yes" fi -echo "Linux splice(2) $linux_splice" +print_config "Linux splice(2)" "$linux_splice" ########################################## # GUASI probe -guasi="no" +if test "$guasi" != "yes" ; then + guasi="no" +fi cat > $TMPC << EOF #include #include @@ -922,11 +1041,13 @@ if compile_prog "" "" "guasi"; then guasi="yes" fi -echo "GUASI $guasi" +print_config "GUASI" "$guasi" ########################################## # fusion-aw probe -fusion_aw="no" +if test "$fusion_aw" != "yes" ; then + fusion_aw="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -942,11 +1063,13 @@ LIBS="-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -ldl -lpthread $LIBS" fusion_aw="yes" fi -echo "Fusion-io atomic engine $fusion_aw" +print_config "Fusion-io atomic engine" "$fusion_aw" ########################################## # libnuma probe 
-libnuma="no" +if test "$libnuma" != "yes" ; then + libnuma="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -958,10 +1081,10 @@ libnuma="yes" LIBS="-lnuma $LIBS" fi -echo "libnuma $libnuma" +print_config "libnuma" "$libnuma" ########################################## -# libnuma 2.x version API +# libnuma 2.x version API, initialize with "no" only if $libnuma is set to "yes" if test "$libnuma" = "yes" ; then libnuma_v2="no" cat > $TMPC << EOF @@ -975,12 +1098,14 @@ if compile_prog "" "" "libnuma api"; then libnuma_v2="yes" fi -echo "libnuma v2 $libnuma_v2" +print_config "libnuma v2" "$libnuma_v2" fi ########################################## # strsep() probe -strsep="no" +if test "$strsep" != "yes" ; then + strsep="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -993,11 +1118,13 @@ if compile_prog "" "" "strsep"; then strsep="yes" fi -echo "strsep $strsep" +print_config "strsep" "$strsep" ########################################## # strcasestr() probe -strcasestr="no" +if test "$strcasestr" != "yes" ; then + strcasestr="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -1008,11 +1135,13 @@ if compile_prog "" "" "strcasestr"; then strcasestr="yes" fi -echo "strcasestr $strcasestr" +print_config "strcasestr" "$strcasestr" ########################################## # strlcat() probe -strlcat="no" +if test "$strlcat" != "yes" ; then + strlcat="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -1027,11 +1156,13 @@ if compile_prog "" "" "strlcat"; then strlcat="yes" fi -echo "strlcat $strlcat" +print_config "strlcat" "$strlcat" ########################################## # getopt_long_only() probe -getopt_long_only="no" +if test "$getopt_long_only" != "yes" ; then + getopt_long_only="no" +fi cat > $TMPC << EOF #include #include @@ -1045,11 +1176,13 @@ if compile_prog "" "" "getopt_long_only"; then getopt_long_only="yes" fi -echo "getopt_long_only() $getopt_long_only" +print_config 
"getopt_long_only()" "$getopt_long_only" ########################################## # inet_aton() probe -inet_aton="no" +if test "$inet_aton" != "yes" ; then + inet_aton="no" +fi cat > $TMPC << EOF #include #include @@ -1063,11 +1196,13 @@ if compile_prog "" "" "inet_aton"; then inet_aton="yes" fi -echo "inet_aton $inet_aton" +print_config "inet_aton" "$inet_aton" ########################################## # socklen_t probe -socklen_t="no" +if test "$socklen_t" != "yes" ; then + socklen_t="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -1079,11 +1214,13 @@ if compile_prog "" "" "socklen_t"; then socklen_t="yes" fi -echo "socklen_t $socklen_t" +print_config "socklen_t" "$socklen_t" ########################################## # Whether or not __thread is supported for TLS -tls_thread="no" +if test "$tls_thread" != "yes" ; then + tls_thread="no" +fi cat > $TMPC << EOF #include static __thread int ret; @@ -1095,11 +1232,13 @@ if compile_prog "" "" "__thread"; then tls_thread="yes" fi -echo "__thread $tls_thread" +print_config "__thread" "$tls_thread" ########################################## # Check if we have required gtk/glib support for gfio -gfio="no" +if test "$gfio" != "yes" ; then + gfio="no" +fi if test "$gfio_check" = "yes" ; then cat > $TMPC << EOF #include @@ -1110,7 +1249,7 @@ gdk_threads_enter(); gdk_threads_leave(); - printf("%d", GTK_CHECK_VERSION(2, 18, 0)); + return GTK_CHECK_VERSION(2, 18, 0) ? 0 : 1; /* 0 on success */ } EOF GTK_CFLAGS=$(pkg-config --cflags gtk+-2.0 gthread-2.0) @@ -1126,8 +1265,8 @@ exit 1 fi if compile_prog "$GTK_CFLAGS" "$GTK_LIBS" "gfio" ; then - r=$($TMPE) - if test "$r" != "0" ; then + $TMPE + if test "$?" 
= "0" ; then gfio="yes" GFIO_LIBS="$LIBS $GTK_LIBS" CFLAGS="$CFLAGS $GTK_CFLAGS" @@ -1143,11 +1282,14 @@ fi if test "$gfio_check" = "yes" ; then - echo "gtk 2.18 or higher $gfio" + print_config "gtk 2.18 or higher" "$gfio" fi +########################################## # Check whether we have getrusage(RUSAGE_THREAD) -rusage_thread="no" +if test "$rusage_thread" != "yes" ; then + rusage_thread="no" +fi cat > $TMPC << EOF #include #include @@ -1161,11 +1303,13 @@ if compile_prog "" "" "RUSAGE_THREAD"; then rusage_thread="yes" fi -echo "RUSAGE_THREAD $rusage_thread" +print_config "RUSAGE_THREAD" "$rusage_thread" ########################################## # Check whether we have SCHED_IDLE -sched_idle="no" +if test "$sched_idle" != "yes" ; then + sched_idle="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -1177,11 +1321,13 @@ if compile_prog "" "" "SCHED_IDLE"; then sched_idle="yes" fi -echo "SCHED_IDLE $sched_idle" +print_config "SCHED_IDLE" "$sched_idle" ########################################## # Check whether we have TCP_NODELAY -tcp_nodelay="no" +if test "$tcp_nodelay" != "yes" ; then + tcp_nodelay="no" +fi cat > $TMPC << EOF #include #include @@ -1195,11 +1341,13 @@ if compile_prog "" "" "TCP_NODELAY"; then tcp_nodelay="yes" fi -echo "TCP_NODELAY $tcp_nodelay" +print_config "TCP_NODELAY" "$tcp_nodelay" ########################################## # Check whether we have SO_SNDBUF -window_size="no" +if test "$window_size" != "yes" ; then + window_size="no" +fi cat > $TMPC << EOF #include #include @@ -1214,11 +1362,13 @@ if compile_prog "" "" "SO_SNDBUF"; then window_size="yes" fi -echo "Net engine window_size $window_size" +print_config "Net engine window_size" "$window_size" ########################################## # Check whether we have TCP_MAXSEG -mss="no" +if test "$mss" != "yes" ; then + mss="no" +fi cat > $TMPC << EOF #include #include @@ -1234,11 +1384,13 @@ if compile_prog "" "" "TCP_MAXSEG"; then mss="yes" fi -echo "TCP_MAXSEG 
$mss" +print_config "TCP_MAXSEG" "$mss" ########################################## # Check whether we have RLIMIT_MEMLOCK -rlimit_memlock="no" +if test "$rlimit_memlock" != "yes" ; then + rlimit_memlock="no" +fi cat > $TMPC << EOF #include #include @@ -1251,11 +1403,13 @@ if compile_prog "" "" "RLIMIT_MEMLOCK"; then rlimit_memlock="yes" fi -echo "RLIMIT_MEMLOCK $rlimit_memlock" +print_config "RLIMIT_MEMLOCK" "$rlimit_memlock" ########################################## # Check whether we have pwritev/preadv -pwritev="no" +if test "$pwritev" != "yes" ; then + pwritev="no" +fi cat > $TMPC << EOF #include #include @@ -1267,11 +1421,13 @@ if compile_prog "" "" "pwritev"; then pwritev="yes" fi -echo "pwritev/preadv $pwritev" +print_config "pwritev/preadv" "$pwritev" ########################################## # Check whether we have pwritev2/preadv2 -pwritev2="no" +if test "$pwritev2" != "yes" ; then + pwritev2="no" +fi cat > $TMPC << EOF #include #include @@ -1283,11 +1439,13 @@ if compile_prog "" "" "pwritev2"; then pwritev2="yes" fi -echo "pwritev2/preadv2 $pwritev2" +print_config "pwritev2/preadv2" "$pwritev2" ########################################## # Check whether we have the required functions for ipv6 -ipv6="no" +if test "$ipv6" != "yes" ; then + ipv6="no" +fi cat > $TMPC << EOF #include #include @@ -1310,25 +1468,30 @@ if compile_prog "" "" "ipv6"; then ipv6="yes" fi -echo "IPv6 helpers $ipv6" +print_config "IPv6 helpers" "$ipv6" ########################################## # check for rbd -rbd="no" +if test "$rbd" != "yes" ; then + rbd="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) { - rados_t cluster; rados_ioctx_t io_ctx; + const char cluster_name[] = "ceph"; + const char user_name[] = "client.admin"; const char pool[] = "rbd"; - int major, minor, extra; - rbd_version(&major, &minor, &extra); + rbd_version(&major, &minor, &extra); + /* The rados_create2 signature required was only introduced in ceph 0.65 */ + rados_create2(&cluster, 
cluster_name, user_name, 0); rados_ioctx_create(cluster, pool, &io_ctx); + return 0; } EOF @@ -1336,11 +1499,13 @@ LIBS="-lrbd -lrados $LIBS" rbd="yes" fi -echo "Rados Block Device engine $rbd" +print_config "Rados Block Device engine" "$rbd" ########################################## # check for rbd_poll -rbd_poll="no" +if test "$rbd_poll" != "yes" ; then + rbd_poll="no" +fi if test "$rbd" = "yes"; then cat > $TMPC << EOF #include @@ -1361,12 +1526,14 @@ if compile_prog "" "-lrbd -lrados" "rbd"; then rbd_poll="yes" fi -echo "rbd_poll $rbd_poll" +print_config "rbd_poll" "$rbd_poll" fi ########################################## # check for rbd_invaidate_cache() -rbd_inval="no" +if test "$rbd_inval" != "yes" ; then + rbd_inval="no" +fi if test "$rbd" = "yes"; then cat > $TMPC << EOF #include @@ -1381,12 +1548,14 @@ if compile_prog "" "-lrbd -lrados" "rbd"; then rbd_inval="yes" fi -echo "rbd_invalidate_cache $rbd_inval" +print_config "rbd_invalidate_cache" "$rbd_inval" fi ########################################## # check for blkin -rbd_blkin="no" +if test "$rbd_blkin" != "yes" ; then + rbd_blkin="no" +fi cat > $TMPC << EOF #include #include @@ -1410,11 +1579,13 @@ LIBS="-lblkin $LIBS" rbd_blkin="yes" fi -echo "rbd blkin tracing $rbd_blkin" +print_config "rbd blkin tracing" "$rbd_blkin" ########################################## # Check whether we have setvbuf -setvbuf="no" +if test "$setvbuf" != "yes" ; then + setvbuf="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -1428,16 +1599,18 @@ if compile_prog "" "" "setvbuf"; then setvbuf="yes" fi -echo "setvbuf $setvbuf" +print_config "setvbuf" "$setvbuf" +########################################## # check for gfapi -gfapi="no" +if test "$gfapi" != "yes" ; then + gfapi="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) { - glfs_t *g = glfs_new("foo"); return 0; @@ -1447,10 +1620,10 @@ LIBS="-lgfapi -lglusterfs $LIBS" gfapi="yes" fi - echo "Gluster API engine $gfapi" +print_config 
"Gluster API engine" "$gfapi" ########################################## -# check for gfapi fadvise support +# check for gfapi fadvise support, initialize with "no" only if $gfapi is set to "yes" if test "$gfapi" = "yes" ; then gf_fadvise="no" cat > $TMPC << EOF @@ -1467,12 +1640,14 @@ if compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then gf_fadvise="yes" fi -echo "Gluster API use fadvise $gf_fadvise" +print_config "Gluster API use fadvise" "$gf_fadvise" fi ########################################## # check for gfapi trim support -gf_trim="no" +if test "$gf_trim" != "yes" ; then + gf_trim="no" +fi if test "$gfapi" = "yes" ; then cat > $TMPC << EOF #include @@ -1485,12 +1660,14 @@ if compile_prog "" "-lgfapi -lglusterfs" "gf trim"; then gf_trim="yes" fi -echo "Gluster API trim support $gf_trim" +print_config "Gluster API trim support" "$gf_trim" fi ########################################## # Check if we support stckf on s390 -s390_z196_facilities="no" +if test "$s390_z196_facilities" != "yes" ; then + s390_z196_facilities="no" +fi cat > $TMPC << EOF #define STFLE_BITS_Z196 45 /* various z196 facilities ... */ int main(int argc, char **argv) @@ -1513,11 +1690,11 @@ EOF if compile_prog "" "" "s390_z196_facilities"; then $TMPE - if [[ $? -eq 0 ]]; then + if [ $? 
-eq 0 ]; then s390_z196_facilities="yes" fi fi -echo "s390_z196_facilities $s390_z196_facilities" +print_config "s390_z196_facilities" "$s390_z196_facilities" ########################################## # Check if we have required environment variables configured for libhdfs @@ -1543,11 +1720,13 @@ FIO_HDFS_CPU="amd64" fi fi -echo "HDFS engine $libhdfs" +print_config "HDFS engine" "$libhdfs" ########################################## # Check whether we have MTD -mtd="no" +if test "$mtd" != "yes" ; then + mtd="no" +fi cat > $TMPC << EOF #include #include @@ -1564,16 +1743,68 @@ if compile_prog "" "" "mtd"; then mtd="yes" fi -echo "MTD $mtd" +print_config "MTD" "$mtd" + +########################################## +# Check whether we have libpmem +if test "$libpmem" != "yes" ; then + libpmem="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + int rc; + rc = pmem_is_pmem(0, 0); + return 0; +} +EOF +if compile_prog "" "-lpmem" "libpmem"; then + libpmem="yes" + LIBS="-lpmem $LIBS" +fi +print_config "libpmem" "$libpmem" + +########################################## +# Check whether we have libpmemblk +# libpmem is a prerequisite +if test "$libpmemblk" != "yes" ; then + libpmemblk="no" +fi +if test "$libpmem" = "yes"; then + cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + PMEMblkpool *pbp; + pbp = pmemblk_open("", 0); + return 0; +} +EOF + if compile_prog "" "-lpmemblk" "libpmemblk"; then + libpmemblk="yes" + LIBS="-lpmemblk $LIBS" + fi +fi +print_config "libpmemblk" "$libpmemblk" + +# Choose the ioengines +if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then + devdax="yes" + if test "$libpmemblk" = "yes"; then + pmemblk="yes" + fi +fi ########################################## # Report whether pmemblk engine is enabled -echo "NVML libpmemblk engine $pmemblk" +print_config "NVML pmemblk engine" "$pmemblk" ########################################## # Report whether dev-dax engine is enabled -echo "NVM Device Dax engine 
$devdax" +print_config "NVML dev-dax engine" "$devdax" +########################################## # Check if we have lex/yacc available yacc="no" yacc_is_bison="no" @@ -1632,11 +1863,13 @@ fi fi -echo "lex/yacc for arithmetic $arith" +print_config "lex/yacc for arithmetic" "$arith" ########################################## # Check whether we have setmntent/getmntent -getmntent="no" +if test "$getmntent" != "yes" ; then + getmntent="no" +fi cat > $TMPC << EOF #include #include @@ -1651,7 +1884,7 @@ if compile_prog "" "" "getmntent"; then getmntent="yes" fi -echo "getmntent $getmntent" +print_config "getmntent" "$getmntent" ########################################## # Check whether we have getmntinfo @@ -1660,7 +1893,9 @@ # getmntinfo(3) for FreeBSD/DragonFlyBSD/OpenBSD. # Note that NetBSD needs -Werror to catch warning as error. -getmntinfo="no" +if test "$getmntinfo" != "yes" ; then + getmntinfo="no" +fi cat > $TMPC << EOF #include #include @@ -1674,10 +1909,12 @@ if compile_prog "-Werror" "" "getmntinfo"; then getmntinfo="yes" fi -echo "getmntinfo $getmntinfo" +print_config "getmntinfo" "$getmntinfo" # getmntinfo(3) for NetBSD. -getmntinfo_statvfs="no" +if test "$getmntinfo_statvfs" != "yes" ; then + getmntinfo_statvfs="no" +fi cat > $TMPC << EOF #include #include @@ -1690,25 +1927,18 @@ # Skip the test if the one with statfs arg is detected. 
if test "$getmntinfo" != "yes" && compile_prog "-Werror" "" "getmntinfo_statvfs"; then getmntinfo_statvfs="yes" - echo "getmntinfo_statvfs $getmntinfo_statvfs" + print_config "getmntinfo_statvfs" "$getmntinfo_statvfs" fi ########################################## # Check whether we have _Static_assert -static_assert="no" +if test "$static_assert" != "yes" ; then + static_assert="no" +fi cat > $TMPC << EOF #include #include -#undef offsetof -#ifdef __compiler_offsetof -#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) -#else -#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) -#endif - -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) +#include struct foo { int a, b; @@ -1723,11 +1953,13 @@ if compile_prog "" "" "static_assert"; then static_assert="yes" fi -echo "Static Assert $static_assert" +print_config "Static Assert" "$static_assert" ########################################## # Check whether we have bool / stdbool.h -have_bool="no" +if test "$have_bool" != "yes" ; then + have_bool="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -1739,7 +1971,67 @@ if compile_prog "" "" "bool"; then have_bool="yes" fi -echo "bool $have_bool" +print_config "bool" "$have_bool" + +########################################## +# Check whether we have strndup() +strndup="no" +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + char *res = strndup("test string", 8); + + free(res); + return 0; +} +EOF +if compile_prog "" "" "strndup"; then + strndup="yes" +fi +print_config "strndup" "$strndup" + +########################################## +# check march=armv8-a+crc+crypto +if test "$march_armv8_a_crc_crypto" != "yes" ; then + march_armv8_a_crc_crypto="no" +fi +if test "$cpu" = "arm64" ; then + cat > $TMPC < +#include +#include + +int main(void) +{ + return 0; +} +EOF + if compile_prog 
"-march=armv8-a+crc+crypto" "" ""; then + march_armv8_a_crc_crypto="yes" + CFLAGS="$CFLAGS -march=armv8-a+crc+crypto -DARCH_HAVE_CRC_CRYPTO" + fi +fi +print_config "march_armv8_a_crc_crypto" "$march_armv8_a_crc_crypto" + +########################################## +# cuda probe +if test "$cuda" != "yes" ; then + cuda="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return cuInit(0); +} +EOF +if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then + cuda="yes" + LIBS="-lcuda $LIBS" +fi +print_config "cuda" "$cuda" ############################################################################# @@ -1767,6 +2059,9 @@ if test "$posix_aio_fsync" = "yes" ; then output_sym "CONFIG_POSIXAIO_FSYNC" fi +if test "$posix_pshared" = "yes" ; then + output_sym "CONFIG_PSHARED" +fi if test "$linux_fallocate" = "yes" ; then output_sym "CONFIG_LINUX_FALLOCATE" fi @@ -1854,7 +2149,7 @@ output_sym "CONFIG_RUSAGE_THREAD" fi if test "$gfio" = "yes" ; then - echo "CONFIG_GFIO=y" >> $config_host_mak + output_sym "CONFIG_GFIO" fi if test "$esx" = "yes" ; then output_sym "CONFIG_ESX" @@ -1954,10 +2249,18 @@ if test "$have_bool" = "yes" ; then output_sym "CONFIG_HAVE_BOOL" fi - +if test "$strndup" = "yes" ; then + output_sym "CONFIG_HAVE_STRNDUP" +fi +if test "$disable_opt" = "yes" ; then + output_sym "CONFIG_DISABLE_OPTIMIZATIONS" +fi if test "$zlib" = "no" ; then echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it." 
fi +if test "$cuda" = "yes" ; then + output_sym "CONFIG_CUDA" +fi echo "LIBS+=$LIBS" >> $config_host_mak echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak diff -Nru fio-2.16/crc/crc32c-arm64.c fio-3.1/crc/crc32c-arm64.c --- fio-2.16/crc/crc32c-arm64.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/crc/crc32c-arm64.c 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,114 @@ +#include "crc32c.h" + +#define CRC32C3X8(ITR) \ + crc1 = __crc32cd(crc1, *((const uint64_t *)data + 42*1 + (ITR)));\ + crc2 = __crc32cd(crc2, *((const uint64_t *)data + 42*2 + (ITR)));\ + crc0 = __crc32cd(crc0, *((const uint64_t *)data + 42*0 + (ITR))); + +#define CRC32C7X3X8(ITR) do {\ + CRC32C3X8((ITR)*7+0) \ + CRC32C3X8((ITR)*7+1) \ + CRC32C3X8((ITR)*7+2) \ + CRC32C3X8((ITR)*7+3) \ + CRC32C3X8((ITR)*7+4) \ + CRC32C3X8((ITR)*7+5) \ + CRC32C3X8((ITR)*7+6) \ + } while(0) + +#ifndef HWCAP_CRC32 +#define HWCAP_CRC32 (1 << 7) +#endif /* HWCAP_CRC32 */ + +bool crc32c_arm64_available = false; + +#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO + +#include +#include +#include + +static bool crc32c_probed; + +/* + * Function to calculate reflected crc with PMULL Instruction + * crc done "by 3" for fixed input block size of 1024 bytes + */ +uint32_t crc32c_arm64(unsigned char const *data, unsigned long length) +{ + signed long len = length; + uint32_t crc = ~0; + uint32_t crc0, crc1, crc2; + + /* Load two consts: K1 and K2 */ + const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014; + uint64_t t0, t1; + + while ((len -= 1024) >= 0) { + /* Do first 8 bytes here for better pipelining */ + crc0 = __crc32cd(crc, *(const uint64_t *)data); + crc1 = 0; + crc2 = 0; + data += sizeof(uint64_t); + + /* Process block inline + Process crc0 last to avoid dependency with above */ + CRC32C7X3X8(0); + CRC32C7X3X8(1); + CRC32C7X3X8(2); + CRC32C7X3X8(3); + CRC32C7X3X8(4); + CRC32C7X3X8(5); + + data += 42*3*sizeof(uint64_t); + + /* Merge crc0 and crc1 into crc2 + crc1 multiply by K2 + crc0 multiply by K1 */ + + t1 = (uint64_t)vmull_p64(crc1, k2); + 
t0 = (uint64_t)vmull_p64(crc0, k1); + crc = __crc32cd(crc2, *(const uint64_t *)data); + crc1 = __crc32cd(0, t1); + crc ^= crc1; + crc0 = __crc32cd(0, t0); + crc ^= crc0; + + data += sizeof(uint64_t); + } + + if (!(len += 1024)) + return crc; + + while ((len -= sizeof(uint64_t)) >= 0) { + crc = __crc32cd(crc, *(const uint64_t *)data); + data += sizeof(uint64_t); + } + + /* The following is more efficient than the straight loop */ + if (len & sizeof(uint32_t)) { + crc = __crc32cw(crc, *(const uint32_t *)data); + data += sizeof(uint32_t); + } + if (len & sizeof(uint16_t)) { + crc = __crc32ch(crc, *(const uint16_t *)data); + data += sizeof(uint16_t); + } + if (len & sizeof(uint8_t)) { + crc = __crc32cb(crc, *(const uint8_t *)data); + } + + return crc; +} + +void crc32c_arm64_probe(void) +{ + unsigned long hwcap; + + if (!crc32c_probed) { + hwcap = getauxval(AT_HWCAP); + crc32c_arm64_available = (hwcap & HWCAP_CRC32) != 0; + crc32c_probed = true; + } +} + +#endif /* ARCH_HAVE_ARM64_CRC_CRYPTO */ diff -Nru fio-2.16/crc/crc32c.h fio-3.1/crc/crc32c.h --- fio-2.16/crc/crc32c.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/crc/crc32c.h 2017-09-28 10:23:20.000000000 +0000 @@ -19,9 +19,21 @@ #define CRC32C_H #include "../arch/arch.h" +#include "../lib/types.h" extern uint32_t crc32c_sw(unsigned char const *, unsigned long); -extern int crc32c_intel_available; +extern bool crc32c_arm64_available; +extern bool crc32c_intel_available; + +#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO +extern uint32_t crc32c_arm64(unsigned char const *, unsigned long); +extern void crc32c_arm64_probe(void); +#else +#define crc32c_arm64 crc32c_sw +static inline void crc32c_arm64_probe(void) +{ +} +#endif #ifdef ARCH_HAVE_SSE4_2 extern uint32_t crc32c_intel(unsigned char const *, unsigned long); @@ -35,6 +47,9 @@ static inline uint32_t fio_crc32c(unsigned char const *buf, unsigned long len) { + if (crc32c_arm64_available) + return crc32c_arm64(buf, len); + if (crc32c_intel_available) return crc32c_intel(buf, 
len); diff -Nru fio-2.16/crc/crc32c-intel.c fio-3.1/crc/crc32c-intel.c --- fio-2.16/crc/crc32c-intel.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/crc/crc32c-intel.c 2017-09-28 10:23:20.000000000 +0000 @@ -18,7 +18,7 @@ * Volume 2A: Instruction Set Reference, A-M */ -int crc32c_intel_available = 0; +bool crc32c_intel_available = false; #ifdef ARCH_HAVE_SSE4_2 @@ -30,7 +30,7 @@ #define SCALE_F 4 #endif -static int crc32c_probed; +static bool crc32c_probed; static uint32_t crc32c_intel_le_hw_byte(uint32_t crc, unsigned char const *data, unsigned long length) @@ -87,7 +87,7 @@ do_cpuid(&eax, &ebx, &ecx, &edx); crc32c_intel_available = (ecx & (1 << 20)) != 0; - crc32c_probed = 1; + crc32c_probed = true; } } diff -Nru fio-2.16/crc/fnv.c fio-3.1/crc/fnv.c --- fio-2.16/crc/fnv.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/crc/fnv.c 2017-09-28 10:23:20.000000000 +0000 @@ -2,14 +2,32 @@ #define FNV_PRIME 0x100000001b3ULL +/* + * 64-bit fnv, but don't require 64-bit multiples of data. Use bytes + * for the last unaligned chunk. + */ uint64_t fnv(const void *buf, uint32_t len, uint64_t hval) { const uint64_t *ptr = buf; - const uint64_t *end = (void *) buf + len; - while (ptr < end) { + while (len) { hval *= FNV_PRIME; - hval ^= (uint64_t) *ptr++; + if (len >= sizeof(uint64_t)) { + hval ^= (uint64_t) *ptr++; + len -= sizeof(uint64_t); + continue; + } else { + const uint8_t *ptr8 = (const uint8_t *) ptr; + uint64_t val = 0; + int i; + + for (i = 0; i < len; i++) { + val <<= 8; + val |= (uint8_t) *ptr8++; + } + hval ^= val; + break; + } } return hval; diff -Nru fio-2.16/crc/sha3.c fio-3.1/crc/sha3.c --- fio-2.16/crc/sha3.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/crc/sha3.c 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,173 @@ +/* + * Cryptographic API. 
+ * + * SHA-3, as specified in + * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf + * + * SHA-3 code by Jeff Garzik + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option)• + * any later version. + * + */ +#include +#include + +#include "../os/os.h" + +#include "sha3.h" + +#define KECCAK_ROUNDS 24 + +#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y)))) + +static const uint64_t keccakf_rndc[24] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, + 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL, + 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL, + 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, + 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL, + 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL +}; + +static const int keccakf_rotc[24] = { + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 +}; + +static const int keccakf_piln[24] = { + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 +}; + +/* update the state with given number of rounds */ + +static void keccakf(uint64_t st[25]) +{ + int i, j, round; + uint64_t t, bc[5]; + + for (round = 0; round < KECCAK_ROUNDS; round++) { + + /* Theta */ + for (i = 0; i < 5; i++) + bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] + ^ st[i + 20]; + + for (i = 0; i < 5; i++) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + for (j = 0; j < 25; j += 5) + st[j + i] ^= t; + } + + /* Rho Pi */ + t = st[1]; + for (i = 0; i < 24; i++) { + j = keccakf_piln[i]; + bc[0] = st[j]; + st[j] = ROTL64(t, 
keccakf_rotc[i]); + t = bc[0]; + } + + /* Chi */ + for (j = 0; j < 25; j += 5) { + for (i = 0; i < 5; i++) + bc[i] = st[j + i]; + for (i = 0; i < 5; i++) + st[j + i] ^= (~bc[(i + 1) % 5]) & + bc[(i + 2) % 5]; + } + + /* Iota */ + st[0] ^= keccakf_rndc[round]; + } +} + +static void fio_sha3_init(struct fio_sha3_ctx *sctx, unsigned int digest_sz) +{ + memset(sctx->st, 0, sizeof(sctx->st)); + sctx->md_len = digest_sz; + sctx->rsiz = 200 - 2 * digest_sz; + sctx->rsizw = sctx->rsiz / 8; + sctx->partial = 0; + memset(sctx->buf, 0, sizeof(sctx->buf)); +} + +void fio_sha3_224_init(struct fio_sha3_ctx *sctx) +{ + fio_sha3_init(sctx, SHA3_224_DIGEST_SIZE); +} + +void fio_sha3_256_init(struct fio_sha3_ctx *sctx) +{ + fio_sha3_init(sctx, SHA3_256_DIGEST_SIZE); +} + +void fio_sha3_384_init(struct fio_sha3_ctx *sctx) +{ + fio_sha3_init(sctx, SHA3_384_DIGEST_SIZE); +} + +void fio_sha3_512_init(struct fio_sha3_ctx *sctx) +{ + fio_sha3_init(sctx, SHA3_512_DIGEST_SIZE); +} + +int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data, + unsigned int len) +{ + unsigned int done; + const uint8_t *src; + + done = 0; + src = data; + + if ((sctx->partial + len) > (sctx->rsiz - 1)) { + if (sctx->partial) { + done = -sctx->partial; + memcpy(sctx->buf + sctx->partial, data, + done + sctx->rsiz); + src = sctx->buf; + } + + do { + unsigned int i; + + for (i = 0; i < sctx->rsizw; i++) + sctx->st[i] ^= ((uint64_t *) src)[i]; + keccakf(sctx->st); + + done += sctx->rsiz; + src = data + done; + } while (done + (sctx->rsiz - 1) < len); + + sctx->partial = 0; + } + memcpy(sctx->buf + sctx->partial, src, len - done); + sctx->partial += (len - done); + + return 0; +} + +void fio_sha3_final(struct fio_sha3_ctx *sctx) +{ + unsigned int i, inlen = sctx->partial; + + sctx->buf[inlen++] = 0x06; + memset(sctx->buf + inlen, 0, sctx->rsiz - inlen); + sctx->buf[sctx->rsiz - 1] |= 0x80; + + for (i = 0; i < sctx->rsizw; i++) + sctx->st[i] ^= ((uint64_t *) sctx->buf)[i]; + + keccakf(sctx->st); + + for (i 
= 0; i < sctx->rsizw; i++) + sctx->st[i] = cpu_to_le64(sctx->st[i]); + + memcpy(sctx->sha, sctx->st, sctx->md_len); +} diff -Nru fio-2.16/crc/sha3.h fio-3.1/crc/sha3.h --- fio-2.16/crc/sha3.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/crc/sha3.h 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,42 @@ +/* + * Common values for SHA-3 algorithms + */ +#ifndef __CRYPTO_SHA3_H__ +#define __CRYPTO_SHA3_H__ + +#include + +#define SHA3_224_DIGEST_SIZE (224 / 8) +#define SHA3_224_BLOCK_SIZE (200 - 2 * SHA3_224_DIGEST_SIZE) + +#define SHA3_256_DIGEST_SIZE (256 / 8) +#define SHA3_256_BLOCK_SIZE (200 - 2 * SHA3_256_DIGEST_SIZE) + +#define SHA3_384_DIGEST_SIZE (384 / 8) +#define SHA3_384_BLOCK_SIZE (200 - 2 * SHA3_384_DIGEST_SIZE) + +#define SHA3_512_DIGEST_SIZE (512 / 8) +#define SHA3_512_BLOCK_SIZE (200 - 2 * SHA3_512_DIGEST_SIZE) + +struct fio_sha3_ctx { + uint64_t st[25]; + unsigned int md_len; + unsigned int rsiz; + unsigned int rsizw; + + unsigned int partial; + uint8_t buf[SHA3_224_BLOCK_SIZE]; + + uint8_t *sha; +}; + +void fio_sha3_224_init(struct fio_sha3_ctx *sctx); +void fio_sha3_256_init(struct fio_sha3_ctx *sctx); +void fio_sha3_384_init(struct fio_sha3_ctx *sctx); +void fio_sha3_512_init(struct fio_sha3_ctx *sctx); + +int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data, + unsigned int len); +void fio_sha3_final(struct fio_sha3_ctx *sctx); + +#endif diff -Nru fio-2.16/crc/test.c fio-3.1/crc/test.c --- fio-2.16/crc/test.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/crc/test.c 2017-09-28 10:23:20.000000000 +0000 @@ -16,6 +16,7 @@ #include "../crc/sha1.h" #include "../crc/sha256.h" #include "../crc/sha512.h" +#include "../crc/sha3.h" #include "../crc/xxhash.h" #include "../crc/murmur3.h" #include "../crc/fnv.h" @@ -47,6 +48,10 @@ T_MURMUR3 = 1U << 10, T_JHASH = 1U << 11, T_FNV = 1U << 12, + T_SHA3_224 = 1U << 13, + T_SHA3_256 = 1U << 14, + T_SHA3_384 = 1U << 15, + T_SHA3_512 = 1U << 16, }; static void t_md5(struct test_type *t, void *buf, size_t 
size) @@ -143,6 +148,62 @@ fio_sha512_update(&ctx, buf, size); } +static void t_sha3_224(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[SHA3_224_DIGEST_SIZE]; + struct fio_sha3_ctx ctx = { .sha = sha }; + int i; + + fio_sha3_224_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha3_update(&ctx, buf, size); + fio_sha3_final(&ctx); + } +} + +static void t_sha3_256(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[SHA3_256_DIGEST_SIZE]; + struct fio_sha3_ctx ctx = { .sha = sha }; + int i; + + fio_sha3_256_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha3_update(&ctx, buf, size); + fio_sha3_final(&ctx); + } +} + +static void t_sha3_384(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[SHA3_384_DIGEST_SIZE]; + struct fio_sha3_ctx ctx = { .sha = sha }; + int i; + + fio_sha3_384_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha3_update(&ctx, buf, size); + fio_sha3_final(&ctx); + } +} + +static void t_sha3_512(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[SHA3_512_DIGEST_SIZE]; + struct fio_sha3_ctx ctx = { .sha = sha }; + int i; + + fio_sha3_512_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha3_update(&ctx, buf, size); + fio_sha3_final(&ctx); + } +} + static void t_murmur3(struct test_type *t, void *buf, size_t size) { int i; @@ -247,6 +308,26 @@ .fn = t_fnv, }, { + .name = "sha3-224", + .mask = T_SHA3_224, + .fn = t_sha3_224, + }, + { + .name = "sha3-256", + .mask = T_SHA3_256, + .fn = t_sha3_256, + }, + { + .name = "sha3-384", + .mask = T_SHA3_384, + .fn = t_sha3_384, + }, + { + .name = "sha3-512", + .mask = T_SHA3_512, + .fn = t_sha3_512, + }, + { .name = NULL, }, }; @@ -291,6 +372,7 @@ int i, first = 1; void *buf; + crc32c_arm64_probe(); crc32c_intel_probe(); if (!type) @@ -310,7 +392,7 @@ fill_random_buf(&state, buf, CHUNK); for (i = 0; t[i].name; i++) { - struct timeval tv; + struct timespec ts; double mb_sec; uint64_t usec; char pre[3]; @@ -327,9 +409,9 @@ t[i].fn(&t[i], 
buf, CHUNK); } - fio_gettime(&tv, NULL); + fio_gettime(&ts, NULL); t[i].fn(&t[i], buf, CHUNK); - usec = utime_since_now(&tv); + usec = utime_since_now(&ts); if (usec) { mb_sec = (double) mb / (double) usec; @@ -338,9 +420,9 @@ sprintf(pre, "\t"); else sprintf(pre, "\t\t"); - printf("%s:%s%8.2f MB/sec\n", t[i].name, pre, mb_sec); + printf("%s:%s%8.2f MiB/sec\n", t[i].name, pre, mb_sec); } else - printf("%s:inf MB/sec\n", t[i].name); + printf("%s:inf MiB/sec\n", t[i].name); first = 0; } diff -Nru fio-2.16/debian/changelog fio-3.1/debian/changelog --- fio-2.16/debian/changelog 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/changelog 2017-10-24 07:47:45.000000000 +0000 @@ -1,3 +1,27 @@ +fio (3.1-1) unstable; urgency=medium + + * Imported upstream versions 3.0 and 3.1 (Closes: #869686). + * Adapted my mail address to new company domain. + * patches: + - Refreshed. + - Dropped patch spelling-errors. Applied upstream. + - Changed author in all of my patches to new mail address. + * control: Bumped standards version to 4.1.1.0. + * watch: Changed to pgpsigurlmangle as recommended by policy 4.1.0.0, + section 4.11. + + -- Martin Steigerwald Tue, 24 Oct 2017 09:47:45 +0200 + +fio (2.17-1) unstable; urgency=medium + + * Imported upstream version 2.17. + * debian/patches: Refreshed. + * debian/copyright: Updated, added some files, all GPL-2. + * debian/docs: Added fio_latency2csv.py, example systemd fio.service file, and + logparser histogram scripts from tools/hist. + + -- Martin Steigerwald Mon, 23 Jan 2017 11:10:42 +0100 + fio (2.16-1) unstable; urgency=medium * Imported upstream version 2.16. 
diff -Nru fio-2.16/debian/control fio-3.1/debian/control --- fio-2.16/debian/control 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/control 2017-10-24 07:47:45.000000000 +0000 @@ -1,9 +1,9 @@ Source: fio Section: utils Priority: optional -Maintainer: Martin Steigerwald +Maintainer: Martin Steigerwald Build-Depends: debhelper (>= 9), dpkg-dev (>= 1.16.1~), libaio-dev, zlib1g-dev, librdmacm-dev, libibverbs-dev, librbd-dev, libgtk2.0-dev, libcairo2-dev -Standards-Version: 3.9.8 +Standards-Version: 4.1.1.0 Homepage: https://github.com/axboe/fio Vcs-Git: https://anonscm.debian.org/git/collab-maint/fio.git Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/fio.git diff -Nru fio-2.16/debian/copyright fio-3.1/debian/copyright --- fio-2.16/debian/copyright 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/copyright 2017-10-24 07:47:45.000000000 +0000 @@ -5,11 +5,12 @@ Files: * Copyright: 2005 Jens Axboe - 2006-2012 Jens Axboe + 2006-2017 Jens Axboe License: GPL-2 Files: debian/* -Copyright: 2009-2012 Martin Steigerwald +Copyright: 2009-2017 Martin Steigerwald Comment: This package was debianized by Martin Steigerwald on Tue, 19 May 2009 15:04:02 +0200. @@ -64,7 +65,7 @@ License: GPL-2+ Files: crc/crc32c-intel.c -Copyright: Based on a posting to lkml by Austin Zhang +Copyright: Based on a posting to LKML by Austin Zhang License: GPL-2+ Files: crc/md5.c @@ -91,11 +92,27 @@ Copyright: 2012-2014 Yann Collet License: BSD-2-clause +Files: engines/dev-dax.c +Copyright: 2016 Intel Corp +License: GPL-2 + Files: engines/fusion-aw.c Copyright: 2013 Fusion-io, Inc. 
Santhosh Kumar Koundinya License: GPL-2 +Files: engines/pmemblk.c +Copyright: 2016 Hewlett Packard Enterprise Development LP +License: GPL-2 + +Files: examples/gfapi.fio +Copyright: Originally authored by Castor Fu +License: GPL-2 + +Files: examples/jesd219.fio +Copyright: Based on a posting from Jeff Furlong +License: GPL-2 + Files: exp/expression-parser.l exp/expression-parser.y exp/test-expression-parser.c Copyright: 2014, Stephen M. Cameron License: GPL-2 @@ -142,7 +159,7 @@ License: GPL-2+ Files: oslib/libmtd_common.h -Copyright: 2007,2008 Artem Bityutskiy +Copyright: 2007, 2008 Artem Bityutskiy License: GPL-2+ Files: oslib/libmtd_legacy.c @@ -158,6 +175,10 @@ 2006 KaiGai Kohei License: GPL-2+ +Files: t/read-to-pipe-async.c +Copyright: 2016 Jens Axboe +License: GPL-2+ + Files: tools/fio_generate_plots.1 Copyright: Written by Martin Steigerwald License: GPL-2 @@ -167,6 +188,10 @@ 2016 Ben England License: GPL-2 +Files: tools/hist/* +Copyright: Karl Cronburg +License: GPL-2 + Files: tools/genfio tools/plot/fio2gnuplot tools/plot/fio2gnuplot.1 tools/plot/fio2gnuplot.manpage tools/plot/graph2D.gpm tools/plot/graph3D.gpm tools/plot/math.gpm Copyright: 2013 eNovance SAS Erwan Velu diff -Nru fio-2.16/debian/docs fio-3.1/debian/docs --- fio-2.16/debian/docs 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/docs 2017-10-24 07:47:45.000000000 +0000 @@ -3,4 +3,9 @@ MORAL-LICENSE REPORTING-BUGS examples/ +tools/hist/fiologparser_hist.py +tools/hist/fiologparser_hist.py.1 +tools/hist/half-bins.py +tools/fio_latency2csv.py +tools/fio.service tools/fiologparser.py diff -Nru fio-2.16/debian/patches/configure-no-configlog fio-3.1/debian/patches/configure-no-configlog --- fio-2.16/debian/patches/configure-no-configlog 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/patches/configure-no-configlog 2017-10-24 07:47:45.000000000 +0000 @@ -1,9 +1,9 @@ Description: Remove config.log to fix dpkg-source error about changed files. 
-Author: Martin Steigerwald +Author: Martin Steigerwald --- a/configure +++ b/configure -@@ -1973,3 +1973,5 @@ +@@ -2276,3 +2276,5 @@ include \$(SRCDIR)/Makefile EOF fi diff -Nru fio-2.16/debian/patches/fio2gnuplot-manpage fio-3.1/debian/patches/fio2gnuplot-manpage --- fio-2.16/debian/patches/fio2gnuplot-manpage 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/patches/fio2gnuplot-manpage 2017-10-24 07:47:45.000000000 +0000 @@ -1,11 +1,11 @@ Description: Fix tag lintian manpage-section-mismatch. -Author: Martin Steigerwald +Author: Martin Steigerwald --- a/tools/plot/fio2gnuplot.1 +++ b/tools/plot/fio2gnuplot.1 @@ -1,5 +1,5 @@ .\" Text automatically generated by txt2man --.TH fio2gnuplot "07 août 2013" "" "" +-.TH fio2gnuplot 1 "August 2013" +.TH fio2gnuplot 1 "07 August 2013" "User Manual" .SH NAME \fBfio2gnuplot \fP- Render fio's output files with gnuplot diff -Nru fio-2.16/debian/patches/fix-ftbfs-with-libmtd.h fio-3.1/debian/patches/fix-ftbfs-with-libmtd.h --- fio-2.16/debian/patches/fix-ftbfs-with-libmtd.h 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/patches/fix-ftbfs-with-libmtd.h 2017-10-24 07:47:45.000000000 +0000 @@ -1,5 +1,5 @@ Description: fix FTBFS libmtd.h:288:8: error: unknown type name 'uint8_t' (Debian Bug 815735) -Author: Martin Steigerwald +Author: Martin Steigerwald --- a/oslib/libmtd.h +++ b/oslib/libmtd.h diff -Nru fio-2.16/debian/patches/makefile-hardening fio-3.1/debian/patches/makefile-hardening --- fio-2.16/debian/patches/makefile-hardening 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/patches/makefile-hardening 2017-10-24 07:47:45.000000000 +0000 @@ -1,5 +1,5 @@ Description: Keep hardening build flags. 
-Author: Martin Steigerwald +Author: Martin Steigerwald --- a/Makefile +++ b/Makefile diff -Nru fio-2.16/debian/patches/makefile-manpagepath fio-3.1/debian/patches/makefile-manpagepath --- fio-2.16/debian/patches/makefile-manpagepath 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/patches/makefile-manpagepath 2017-10-24 07:47:45.000000000 +0000 @@ -1,9 +1,9 @@ Description: Adapt manpage path to Debian. -Author: Martin Steigerwald +Author: Martin Steigerwald --- a/Makefile +++ b/Makefile -@@ -298,7 +298,7 @@ +@@ -306,7 +306,7 @@ mandir = /usr/share/man sharedir = /usr/share/fio else diff -Nru fio-2.16/debian/patches/reproducible-build fio-3.1/debian/patches/reproducible-build --- fio-2.16/debian/patches/reproducible-build 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/patches/reproducible-build 2017-10-24 07:47:45.000000000 +0000 @@ -3,7 +3,7 @@ --- a/Makefile +++ b/Makefile -@@ -185,7 +185,7 @@ +@@ -188,7 +188,7 @@ CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format -static endif diff -Nru fio-2.16/debian/patches/series fio-3.1/debian/patches/series --- fio-2.16/debian/patches/series 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/patches/series 2017-10-24 07:47:45.000000000 +0000 @@ -2,5 +2,4 @@ fio2gnuplot-manpage configure-no-configlog fix-ftbfs-with-libmtd.h -spelling-errors reproducible-build diff -Nru fio-2.16/debian/patches/spelling-errors fio-3.1/debian/patches/spelling-errors --- fio-2.16/debian/patches/spelling-errors 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/patches/spelling-errors 1970-01-01 00:00:00.000000000 +0000 @@ -1,79 +0,0 @@ -Description: Fix some spelling errors in fio binary, fio manpage and HOWTO. -Author: Martin Steigerwald - ---- a/HOWTO -+++ b/HOWTO -@@ -685,13 +685,13 @@ - the next. Multiple files can still be - open depending on 'openfiles'. - -- zipf Use a zipfian distribution to decide what file -+ zipf Use a Zipfian distribution to decide what file - to access. 
- -- pareto Use a pareto distribution to decide what file -+ pareto Use a Pareto distribution to decide what file - to access. - -- gauss Use a gaussian (normal) distribution to decide -+ gauss Use a Gaussian (normal) distribution to decide - what file to access. - - For random, roundrobin, and sequential, a postfix can be -@@ -998,7 +998,7 @@ - random Uniform random distribution - zipf Zipf distribution - pareto Pareto distribution -- gauss Normal (gaussian) distribution -+ gauss Normal (Gaussian) distribution - zoned Zoned random distribution - - When using a zipf or pareto distribution, an input value -@@ -1696,7 +1696,7 @@ - - log_hist_msec=int Same as log_avg_msec, but logs entries for completion - latency histograms. Computing latency percentiles from averages of -- intervals using log_avg_msec is innacurate. Setting this option makes -+ intervals using log_avg_msec is inacurate. Setting this option makes - fio log histogram entries over the specified period of time, reducing - log sizes for high IOPS devices while retaining percentile accuracy. - See log_hist_coarseness as well. Defaults to 0, meaning histogram ---- a/fio.1 -+++ b/fio.1 -@@ -592,13 +592,13 @@ - Do each file in the set sequentially. - .TP - .B zipf --Use a zipfian distribution to decide what file to access. -+Use a Zipfian distribution to decide what file to access. - .TP - .B pareto --Use a pareto distribution to decide what file to access. -+Use a Pareto distribution to decide what file to access. - .TP - .B gauss --Use a gaussian (normal) distribution to decide what file to access. -+Use a Gaussian (normal) distribution to decide what file to access. - .RE - .P - For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be -@@ -1575,7 +1575,7 @@ - .BI log_hist_msec \fR=\fPint - Same as \fBlog_avg_msec\fR, but logs entries for completion latency histograms. - Computing latency percentiles from averages of intervals using \fBlog_avg_msec\fR --is innacurate. 
Setting this option makes fio log histogram entries over the -+is inacurate. Setting this option makes fio log histogram entries over the - specified period of time, reducing log sizes for high IOPS devices while - retaining percentile accuracy. See \fBlog_hist_coarseness\fR as well. Defaults - to 0, meaning histogram logging is disabled. ---- a/options.c -+++ b/options.c -@@ -2234,7 +2234,7 @@ - }, - { .ival = "gauss", - .oval = FIO_FSERVICE_GAUSS, -- .help = "Normal (gaussian) distribution", -+ .help = "Normal (Gaussian) distribution", - }, - { .ival = "roundrobin", - .oval = FIO_FSERVICE_RR, diff -Nru fio-2.16/debian/watch fio-3.1/debian/watch --- fio-2.16/debian/watch 2016-12-23 16:09:43.000000000 +0000 +++ fio-3.1/debian/watch 2017-10-24 07:47:45.000000000 +0000 @@ -1,3 +1,3 @@ version=4 -opts="pgpmode=next" http://brick.kernel.dk/snaps/fio-(\d.*)\.tar\.gz -opts="pgpmode=previous" http://brick.kernel.dk/snaps/fio-(\d.*)\.tar\.gz\.asc +opts=pgpmode=mangle +opts=pgpsigurlmangle=s/$/.asc/ http://brick.kernel.dk/snaps/fio-(\d.*)\.tar\.gz diff -Nru fio-2.16/diskutil.c fio-3.1/diskutil.c --- fio-2.16/diskutil.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/diskutil.c 2017-09-28 10:23:20.000000000 +0000 @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -18,8 +19,6 @@ static struct fio_mutex *disk_util_mutex; -FLIST_HEAD(disk_list); - static struct disk_util *__init_per_file_disk_util(struct thread_data *td, int majdev, int mindev, char *path); @@ -37,6 +36,7 @@ } fio_mutex_remove(du->lock); + free(du->sysfs_root); sfree(du); } @@ -85,7 +85,7 @@ static void update_io_tick_disk(struct disk_util *du) { struct disk_util_stat __dus, *dus, *ldus; - struct timeval t; + struct timespec t; if (!du->users) return; @@ -305,7 +305,7 @@ return NULL; } strncpy((char *) du->dus.name, basename(path), FIO_DU_NAME_SZ - 1); - du->sysfs_root = path; + du->sysfs_root = strdup(path); du->major = majdev; du->minor = mindev; 
INIT_FLIST_HEAD(&du->slavelist); @@ -364,7 +364,7 @@ return 0; while ((dir = readdir(D)) != NULL) { - char full_path[256]; + char full_path[257]; if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) continue; @@ -430,9 +430,6 @@ sprintf(path, "%s", tmp); } - if (td->o.ioscheduler && !td->sysfs_root) - td->sysfs_root = strdup(path); - return disk_util_add(td, majdev, mindev, path); } @@ -451,12 +448,8 @@ mindev); du = disk_util_exists(majdev, mindev); - if (du) { - if (td->o.ioscheduler && !td->sysfs_root) - td->sysfs_root = strdup(du->sysfs_root); - + if (du) return du; - } /* * for an fs without a device, we will repeatedly stat through diff -Nru fio-2.16/diskutil.h fio-3.1/diskutil.h --- fio-2.16/diskutil.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/diskutil.h 2017-09-28 10:23:20.000000000 +0000 @@ -46,7 +46,6 @@ */ struct flist_head slavelist; - char *name; char *sysfs_root; char path[PATH_MAX]; int major, minor; @@ -65,7 +64,7 @@ */ struct flist_head slaves; - struct timeval time; + struct timespec time; struct fio_mutex *lock; unsigned long users; @@ -115,6 +114,7 @@ extern void setup_disk_util(void); extern void disk_util_prune_entries(void); #else +/* keep this as a function to avoid a warning in handle_du() */ static inline void print_disk_util(struct disk_util_stat *du, struct disk_util_agg *agg, int terse, struct buf_output *out) diff -Nru fio-2.16/doc/conf.py fio-3.1/doc/conf.py --- fio-2.16/doc/conf.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/doc/conf.py 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,360 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# fio documentation build configuration file, created by +# sphinx-quickstart on Mon Nov 14 13:56:30 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. 
+# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +# +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'fio' +copyright = '2017, Jens Axboe ' +author = 'Jens Axboe ' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# + +# The short X.Y version. +# version = '1' +# The full version, including alpha/beta/rc tags. +# release = '1' + +def fio_version(): + + from os.path import exists, dirname, join + wsroot = dirname(dirname(__file__)) + version_file = join(wsroot, "FIO-VERSION-FILE") + if not exists(version_file): + version_gen = join(wsroot, "FIO-VERSION-GEN") + from subprocess import call + rc = call(version_gen, shell=True, cwd=wsroot) + if rc: + print("Couldn't generate version file. 
rc=%r" % rc) + return "Unknown", "Unknown" + + vsl = open(version_file).read().strip().split('-') + version = vsl[1] + release = '-'.join(vsl[1:]) + return version, release + +version, release = fio_version() + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# +# today = '' +# +# Else, today_fmt is used as the format for a strftime call. +# +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['output', 'Thumbs.db', '.DS_Store', 'fio_examples.rst'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. +# " v documentation" by default. +# +# html_title = 'fio v1' + +# A shorter title for the navigation bar. Default is the same as html_title. +# +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# +# html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# +# html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. +# The empty string is equivalent to '%b %d, %Y'. +# +# html_last_updated_fmt = None + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. 
+# +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# +# html_additional_pages = {} + +# If false, no module index is generated. +# +# html_domain_indices = True + +# If false, no index is generated. +# +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' +# +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +# +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'fiodoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper').
+ # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'fio.tex', 'fio Documentation', + 'a', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# +# latex_use_parts = False + +# If true, show page references after internal links. +# +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# +# latex_appendices = [] + +# If false, will not define \strong, \code, \titleref, \crossref ... but only +# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added +# packages. +# +# latex_keep_old_macro_names = True + +# If false, no module index is generated. +# +# latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('fio_man', 'fio', 'flexible I/O tester', + [author], 1) +] + +# If true, show URL addresses after external links. +# +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files.
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'fio', 'fio Documentation', + author, 'fio', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# +# texinfo_appendices = [] + +# If false, no module index is generated. +# +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# +# texinfo_no_detailmenu = False diff -Nru fio-2.16/doc/fio_doc.rst fio-3.1/doc/fio_doc.rst --- fio-2.16/doc/fio_doc.rst 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/doc/fio_doc.rst 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,51 @@ +fio - Flexible I/O tester rev. |version| +======================================== + + +.. include:: ../README + + +.. include:: ../HOWTO + + + +Examples +======== + +.. include:: fio_examples.rst + + + +TODO +==== + + +GFIO TODO +--------- + +.. include:: ../GFIO-TODO + + +Server TODO +----------- + +.. include:: ../SERVER-TODO + + +Steady State TODO +----------------- + +.. include:: ../STEADYSTATE-TODO + + + +Moral License +============= + +.. include:: ../MORAL-LICENSE + + +License +======= + +.. literalinclude:: ../COPYING diff -Nru fio-2.16/doc/fio_examples.rst fio-3.1/doc/fio_examples.rst --- fio-2.16/doc/fio_examples.rst 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/doc/fio_examples.rst 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,72 @@ +Some job file examples. + + +Poisson request flow +-------------------- + +.. only:: builder_html + +:download:`Download poisson-rate-submission.fio <../examples/poisson-rate-submission.fio>` + +.. literalinclude:: ../examples/poisson-rate-submission.fio + :language: ini + +Latency profile +--------------- + +.. 
only:: builder_html + +:download:`Download latency-profile.fio <../examples/latency-profile.fio>` + +.. literalinclude:: ../examples/latency-profile.fio + :language: ini + +Read 4 files with aio at different depths +----------------------------------------- + +.. only:: builder_html + +:download:`Download aio-read.fio <../examples/aio-read.fio>` + +.. literalinclude:: ../examples/aio-read.fio + :language: ini + +Read backwards in a file +------------------------ + +.. only:: builder_html + +:download:`Download backwards-read.fio <../examples/backwards-read.fio>` + +.. literalinclude:: ../examples/backwards-read.fio + :language: ini + +Basic verification +------------------ + +.. only:: builder_html + +:download:`Download basic-verify.fio <../examples/basic-verify.fio>` + +.. literalinclude:: ../examples/basic-verify.fio + :language: ini + +Fixed rate submission +--------------------- + +.. only:: builder_html + +:download:`Download fixed-rate-submission.fio <../examples/fixed-rate-submission.fio>` + +.. literalinclude:: ../examples/fixed-rate-submission.fio + :language: ini + +Butterfly seek pattern +----------------------- + +.. only:: builder_html + +:download:`Download butterfly.fio <../examples/butterfly.fio>` + +.. literalinclude:: ../examples/butterfly.fio + :language: ini diff -Nru fio-2.16/doc/fio_man.rst fio-3.1/doc/fio_man.rst --- fio-2.16/doc/fio_man.rst 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/doc/fio_man.rst 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,12 @@ +:orphan: + +Fio Manpage +=========== + +(rev. |release|) + + +.. include:: ../README + + +.. include:: ../HOWTO diff -Nru fio-2.16/doc/index.rst fio-3.1/doc/index.rst --- fio-2.16/doc/index.rst 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/doc/index.rst 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,25 @@ +.. FIO documentation master file, created by + sphinx-quickstart on Thu Mar 20 16:24:25 2015. 
+ You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to FIO's documentation! +=============================== + +**Version:** |release| + +Contents: + +.. toctree:: + :maxdepth: 3 + :numbered: + + fio - Flexible I/O tester |version| + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` + diff -Nru fio-2.16/doc/make.bat fio-3.1/doc/make.bat --- fio-2.16/doc/make.bat 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/doc/make.bat 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,281 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. epub3 to make an epub3 + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. 
linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + echo. dummy to check syntax errors of document sources + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\fio.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\fio.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "epub3" ( + %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
+ goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. 
+ goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +if "%1" == "dummy" ( + %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. Dummy builder generates no files. + goto end +) + +:end diff -Nru fio-2.16/doc/Makefile fio-3.1/doc/Makefile --- fio-2.16/doc/Makefile 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/doc/Makefile 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,225 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = output + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " epub3 to make an epub3" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + @echo " dummy to check syntax errors of document sources" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
+ +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fio.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fio.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/fio" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fio" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
+ +.PHONY: epub3 +epub3: + $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 + @echo + @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +.PHONY: dummy +dummy: + $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy + @echo + @echo "Build finished. Dummy builder generates no files." 
diff -Nru fio-2.16/engines/binject.c fio-3.1/engines/binject.c --- fio-2.16/engines/binject.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/binject.c 2017-09-28 10:23:20.000000000 +0000 @@ -59,11 +59,12 @@ return 0; } -static unsigned int binject_read_commands(struct thread_data *td, void *p, +static unsigned int binject_read_commands(struct thread_data *td, void *buf, int left, int *err) { struct fio_file *f; int i, ret, events; + char *p = buf; one_more: events = 0; @@ -351,7 +352,7 @@ if (ret) return 1; - if (f->filetype != FIO_TYPE_BD) { + if (f->filetype != FIO_TYPE_BLOCK) { log_err("fio: binject only works with block devices\n"); goto err_close; } diff -Nru fio-2.16/engines/cpu.c fio-3.1/engines/cpu.c --- fio-2.16/engines/cpu.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/cpu.c 2017-09-28 10:23:20.000000000 +0000 @@ -22,7 +22,7 @@ .type = FIO_OPT_INT, .off1 = offsetof(struct cpu_options, cpuload), .help = "Use this percentage of CPU", - .category = FIO_OPT_C_GENERAL, + .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_INVALID, }, { @@ -34,7 +34,7 @@ .def = "50000", .parent = "cpuload", .hide = 1, - .category = FIO_OPT_C_GENERAL, + .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_INVALID, }, { @@ -44,7 +44,7 @@ .off1 = offsetof(struct cpu_options, exit_io_done), .help = "Exit when IO threads finish", .def = "0", - .category = FIO_OPT_C_GENERAL, + .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_INVALID, }, { diff -Nru fio-2.16/engines/dev-dax.c fio-3.1/engines/dev-dax.c --- fio-2.16/engines/dev-dax.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/dev-dax.c 2017-09-28 10:23:20.000000000 +0000 @@ -51,14 +51,14 @@ #include #include #include -#include #include +#include #include "../fio.h" #include "../verify.h" /* - * Limits us to 1GB of mapped files in total to model after + * Limits us to 1GiB of mapped files in total to model after * mmap engine behavior */ #define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL) @@ -69,8 +69,6 @@ off_t 
devdax_off; }; -static void * (*pmem_memcpy_persist)(void *dest, const void *src, size_t len); - static int fio_devdax_file(struct thread_data *td, struct fio_file *f, size_t length, off_t off) { @@ -108,7 +106,7 @@ struct fio_devdax_data *fdd = FILE_ENG_DATA(f); if (io_u->buflen > f->real_file_size) { - log_err("fio: bs too big for dev-dax engine\n"); + log_err("dev-dax: bs too big for dev-dax engine\n"); return EIO; } @@ -212,29 +210,11 @@ static int fio_devdax_init(struct thread_data *td) { struct thread_options *o = &td->o; - const char *path; - void *dl; if ((o->rw_min_bs & page_mask) && (o->fsync_blocks || o->fdatasync_blocks)) { - log_err("fio: mmap options dictate a minimum block size of " - "%llu bytes\n", (unsigned long long) page_size); - return 1; - } - - path = getenv("FIO_PMEM_LIB"); - if (!path) - path = "libpmem.so"; - - dl = dlopen(path, RTLD_NOW | RTLD_NODELETE); - if (!dl) { - log_err("fio: unable to open libpmem: %s\n", dlerror()); - return 1; - } - - pmem_memcpy_persist = dlsym(dl, "pmem_memcpy_persist"); - if (!pmem_memcpy_persist) { - log_err("fio: unable to load libpmem: %s\n", dlerror()); + log_err("dev-dax: mmap options dictate a minimum block size of %llu bytes\n", + (unsigned long long) page_size); return 1; } @@ -292,8 +272,8 @@ rc = stat(f->file_name, &st); if (rc < 0) { - log_err("%s: failed to stat file %s: %d\n", - td->o.name, f->file_name, errno); + log_err("%s: failed to stat file %s (%s)\n", + td->o.name, f->file_name, strerror(errno)); return -errno; } @@ -302,8 +282,8 @@ rpath = realpath(spath, npath); if (!rpath) { - log_err("%s: realpath on %s failed: %d\n", - td->o.name, spath, errno); + log_err("%s: realpath on %s failed (%s)\n", + td->o.name, spath, strerror(errno)); return -errno; } @@ -318,15 +298,15 @@ sfile = fopen(spath, "r"); if (!sfile) { - log_err("%s: fopen on %s failed: %d\n", - td->o.name, spath, errno); + log_err("%s: fopen on %s failed (%s)\n", + td->o.name, spath, strerror(errno)); return 1; } rc = 
fscanf(sfile, "%lu", &size); if (rc < 0) { - log_err("%s: fscanf on %s failed: %d\n", - td->o.name, spath, errno); + log_err("%s: fscanf on %s failed (%s)\n", + td->o.name, spath, strerror(errno)); return 1; } diff -Nru fio-2.16/engines/e4defrag.c fio-3.1/engines/e4defrag.c --- fio-2.16/engines/e4defrag.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/e4defrag.c 2017-09-28 10:23:20.000000000 +0000 @@ -95,7 +95,7 @@ ed->donor_fd = open(donor_name, O_CREAT|O_WRONLY, 0644); if (ed->donor_fd < 0) { td_verror(td, errno, "io_queue_init"); - log_err("Can't open donor file %s err:%d", donor_name, ed->donor_fd); + log_err("Can't open donor file %s err:%d\n", donor_name, ed->donor_fd); free(ed); return 1; } @@ -172,8 +172,13 @@ len = io_u->xfer_buflen; if (len != io_u->xfer_buflen) { - io_u->resid = io_u->xfer_buflen - len; - io_u->error = 0; + if (len) { + io_u->resid = io_u->xfer_buflen - len; + io_u->error = 0; + } else { + /* access beyond i_size */ + io_u->error = EINVAL; + } } if (ret) io_u->error = errno; diff -Nru fio-2.16/engines/ftruncate.c fio-3.1/engines/ftruncate.c --- fio-2.16/engines/ftruncate.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/engines/ftruncate.c 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,56 @@ +/* + * ftruncate: ioengine for git://git.kernel.dk/fio.git + * + * IO engine that does regular truncates to simulate data transfer + * as fio ioengine. 
+ * DDIR_WRITE does ftruncate + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../filehash.h" + +static int fio_ftruncate_queue(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + int ret; + fio_ro_check(td, io_u); + + if (io_u->ddir != DDIR_WRITE) { + io_u->error = EINVAL; + return FIO_Q_COMPLETED; + } + ret = ftruncate(f->fd, io_u->offset); + + if (ret) + io_u->error = errno; + + return FIO_Q_COMPLETED; +} + +static struct ioengine_ops ioengine = { + .name = "ftruncate", + .version = FIO_IOOPS_VERSION, + .queue = fio_ftruncate_queue, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO | FIO_FAKEIO +}; + +static void fio_init fio_syncio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_syncio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.16/engines/glusterfs_async.c fio-3.1/engines/glusterfs_async.c --- fio-2.16/engines/glusterfs_async.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/glusterfs_async.c 2017-09-28 10:23:20.000000000 +0000 @@ -92,7 +92,7 @@ struct io_u *io_u = data; struct fio_gf_iou *iou = io_u->engine_data; - dprint(FD_IO, "%s ret %lu\n", __FUNCTION__, ret); + dprint(FD_IO, "%s ret %zd\n", __FUNCTION__, ret); iou->io_complete = 1; } diff -Nru fio-2.16/engines/glusterfs.c fio-3.1/engines/glusterfs.c --- fio-2.16/engines/glusterfs.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/glusterfs.c 2017-09-28 10:23:20.000000000 +0000 @@ -165,11 +165,11 @@ if (td_read(td)) { if (glfs_lstat(g->fs, f->file_name, &sb) || sb.st_size < f->real_file_size) { - dprint(FD_FILE, "fio extend file %s from %ld to %ld\n", - f->file_name, sb.st_size, f->real_file_size); + dprint(FD_FILE, "fio extend file %s from %jd to %" PRIu64 "\n", + f->file_name, (intmax_t) sb.st_size, f->real_file_size); ret = glfs_ftruncate(g->fd, 
f->real_file_size); if (ret) { - log_err("failed fio extend file %s to %ld\n", + log_err("failed fio extend file %s to %" PRIu64 "\n", f->file_name, f->real_file_size); } else { unsigned long long left; @@ -190,7 +190,7 @@ r = glfs_write(g->fd, b, bs, 0); dprint(FD_IO, - "fio write %d of %ld file %s\n", + "fio write %d of %" PRIu64 " file %s\n", r, f->real_file_size, f->file_name); diff -Nru fio-2.16/engines/glusterfs_sync.c fio-3.1/engines/glusterfs_sync.c --- fio-2.16/engines/glusterfs_sync.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/glusterfs_sync.c 2017-09-28 10:23:20.000000000 +0000 @@ -7,7 +7,7 @@ #include "gfapi.h" -#define LAST_POS(f) ((f)->engine_data) +#define LAST_POS(f) ((f)->engine_pos) static int fio_gf_prep(struct thread_data *td, struct io_u *io_u) { struct fio_file *f = io_u->file; diff -Nru fio-2.16/engines/guasi.c fio-3.1/engines/guasi.c --- fio-2.16/engines/guasi.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/guasi.c 2017-09-28 10:23:20.000000000 +0000 @@ -132,7 +132,7 @@ { int i; struct io_u *io_u; - struct timeval now; + struct timespec now; if (!fio_fill_issue_time(td)) return; diff -Nru fio-2.16/engines/libaio.c fio-3.1/engines/libaio.c --- fio-2.16/engines/libaio.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/libaio.c 2017-09-28 10:23:20.000000000 +0000 @@ -220,7 +220,7 @@ static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us, unsigned int nr) { - struct timeval now; + struct timespec now; unsigned int i; if (!fio_fill_issue_time(td)) @@ -241,7 +241,7 @@ struct libaio_data *ld = td->io_ops_data; struct iocb **iocbs; struct io_u **io_us; - struct timeval tv; + struct timespec ts; int ret, wait_start = 0; if (!ld->queued) @@ -282,9 +282,9 @@ break; } if (!wait_start) { - fio_gettime(&tv, NULL); + fio_gettime(&ts, NULL); wait_start = 1; - } else if (mtime_since_now(&tv) > 30000) { + } else if (mtime_since_now(&ts) > 30000) { log_err("fio: aio appears to be stalled, giving up\n"); break; } 
diff -Nru fio-2.16/engines/mmap.c fio-3.1/engines/mmap.c --- fio-2.16/engines/mmap.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/mmap.c 2017-09-28 10:23:20.000000000 +0000 @@ -15,7 +15,7 @@ #include "../verify.h" /* - * Limits us to 1GB of mapped files in total + * Limits us to 1GiB of mapped files in total */ #define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL) @@ -67,7 +67,7 @@ } #ifdef FIO_MADV_FREE - if (f->filetype == FIO_TYPE_BD) + if (f->filetype == FIO_TYPE_BLOCK) (void) posix_madvise(fmd->mmap_ptr, fmd->mmap_sz, FIO_MADV_FREE); #endif diff -Nru fio-2.16/engines/mtd.c fio-3.1/engines/mtd.c --- fio-2.16/engines/mtd.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/mtd.c 2017-09-28 10:23:20.000000000 +0000 @@ -13,6 +13,7 @@ #include #include "../fio.h" +#include "../optgroup.h" #include "../verify.h" #include "../oslib/libmtd.h" @@ -22,6 +23,28 @@ struct mtd_dev_info info; }; +struct fio_mtd_options { + void *pad; /* avoid off1 == 0 */ + unsigned int skip_bad; +}; + +static struct fio_option options[] = { + { + .name = "skip_bad", + .lname = "Skip operations against bad blocks", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct fio_mtd_options, skip_bad), + .help = "Skip operations against known bad blocks.", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_MTD, + }, + { + .name = NULL, + }, +}; + static int fio_mtd_maybe_mark_bad(struct thread_data *td, struct fio_mtd_data *fmd, struct io_u *io_u, int eb) @@ -55,6 +78,7 @@ { struct fio_file *f = io_u->file; struct fio_mtd_data *fmd = FILE_ENG_DATA(f); + struct fio_mtd_options *o = td->eo; int local_offs = 0; int ret; @@ -77,7 +101,7 @@ (int)fmd->info.eb_size - eb_offs); char *buf = ((char *)io_u->buf) + local_offs; - if (td->o.skip_bad) { + if (o->skip_bad) { ret = fio_mtd_is_bad(td, fmd, io_u, eb); if (ret == -1) break; @@ -190,6 +214,8 @@ .close_file = fio_mtd_close_file, .get_file_size = fio_mtd_get_file_size, .flags = FIO_SYNCIO | FIO_NOEXTEND, + .options = 
options, + .option_struct_size = sizeof(struct fio_mtd_options), }; static void fio_init fio_mtd_register(void) diff -Nru fio-2.16/engines/net.c fio-3.1/engines/net.c --- fio-2.16/engines/net.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/net.c 2017-09-28 10:23:20.000000000 +0000 @@ -1218,7 +1218,7 @@ return 1; } if (is_ipv6(o)) { - log_err("fio: IPv6 not supported for multicast network IO"); + log_err("fio: IPv6 not supported for multicast network IO\n"); close(fd); return 1; } @@ -1371,7 +1371,7 @@ } if (!td->io_ops_data) { - nd = malloc(sizeof(*nd));; + nd = malloc(sizeof(*nd)); memset(nd, 0, sizeof(*nd)); nd->listenfd = -1; diff -Nru fio-2.16/engines/null.c fio-3.1/engines/null.c --- fio-2.16/engines/null.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/null.c 2017-09-28 10:23:20.000000000 +0000 @@ -135,23 +135,21 @@ #ifdef FIO_EXTERNAL_ENGINE extern "C" { +static struct ioengine_ops ioengine; void get_ioengine(struct ioengine_ops **ioengine_ptr) { - struct ioengine_ops *ioengine; + *ioengine_ptr = &ioengine; - *ioengine_ptr = (struct ioengine_ops *) malloc(sizeof(struct ioengine_ops)); - ioengine = *ioengine_ptr; - - strcpy(ioengine->name, "cpp_null"); - ioengine->version = FIO_IOOPS_VERSION; - ioengine->queue = fio_null_queue; - ioengine->commit = fio_null_commit; - ioengine->getevents = fio_null_getevents; - ioengine->event = fio_null_event; - ioengine->init = fio_null_init; - ioengine->cleanup = fio_null_cleanup; - ioengine->open_file = fio_null_open; - ioengine->flags = FIO_DISKLESSIO | FIO_FAKEIO; + ioengine.name = "cpp_null"; + ioengine.version = FIO_IOOPS_VERSION; + ioengine.queue = fio_null_queue; + ioengine.commit = fio_null_commit; + ioengine.getevents = fio_null_getevents; + ioengine.event = fio_null_event; + ioengine.init = fio_null_init; + ioengine.cleanup = fio_null_cleanup; + ioengine.open_file = fio_null_open; + ioengine.flags = FIO_DISKLESSIO | FIO_FAKEIO; } } #endif /* FIO_EXTERNAL_ENGINE */ diff -Nru 
fio-2.16/engines/pmemblk.c fio-3.1/engines/pmemblk.c --- fio-2.16/engines/pmemblk.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/pmemblk.c 2017-09-28 10:23:20.000000000 +0000 @@ -27,11 +27,11 @@ * ioengine=pmemblk * * Other relevant settings: + * thread=1 REQUIRED * iodepth=1 * direct=1 - * thread=1 REQUIRED * unlink=1 - * filename=/pmem0/fiotestfile,BSIZE,FSIZEMB + * filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB * * thread must be set to 1 for pmemblk as multiple processes cannot * open the same block pool file. @@ -39,23 +39,26 @@ * iodepth should be set to 1 as pmemblk is always synchronous. * Use numjobs to scale up. * - * direct=1 is implied as pmemblk is always direct. + * direct=1 is implied as pmemblk is always direct. A warning message + * is printed if this is not specified. + * + * unlink=1 removes the block pool file after testing, and is optional. * - * Can set unlink to 1 to remove the block pool file after testing. + * The pmem device must have a DAX-capable filesystem and be mounted + * with DAX enabled. filename must point to a file on that filesystem. + * + * Example: + * mkfs.xfs /dev/pmem0 + * mkdir /mnt/pmem0 + * mount -o dax /dev/pmem0 /mnt/pmem0 * * When specifying the filename, if the block pool file does not already - * exist, then the pmemblk engine can create the pool file if you specify + * exist, then the pmemblk engine creates the pool file if you specify * the block and file sizes. BSIZE is the block size in bytes. - * FSIZEMB is the pool file size in MB. + * FSIZEMB is the pool file size in MiB. * * See examples/pmemblk.fio for more. * - * libpmemblk.so - * By default, the pmemblk engine will let the system find the libpmemblk.so - * that it uses. You can use an alternative libpmemblk by setting the - * FIO_PMEMBLK_LIB environment variable to the full path to the desired - * libpmemblk.so. 
- * */ #include @@ -64,68 +67,15 @@ #include #include #include -#include #include +#include +#include #include "../fio.h" /* * libpmemblk */ -struct PMEMblkpool_s; -typedef struct PMEMblkpool_s PMEMblkpool; - -static PMEMblkpool *(*pmemblk_create) (const char *, size_t, size_t, mode_t); -static PMEMblkpool *(*pmemblk_open) (const char *, size_t); -static void (*pmemblk_close) (PMEMblkpool *); -static size_t(*pmemblk_nblock) (PMEMblkpool *); -static size_t(*pmemblk_bsize) (PMEMblkpool *); -static int (*pmemblk_read) (PMEMblkpool *, void *, off_t); -static int (*pmemblk_write) (PMEMblkpool *, const void *, off_t); - -int load_libpmemblk(const char *path) -{ - void *dl; - - if (!path) - path = "libpmemblk.so"; - - dl = dlopen(path, RTLD_NOW | RTLD_NODELETE); - if (!dl) - goto errorout; - - pmemblk_create = dlsym(dl, "pmemblk_create"); - if (!pmemblk_create) - goto errorout; - pmemblk_open = dlsym(dl, "pmemblk_open"); - if (!pmemblk_open) - goto errorout; - pmemblk_close = dlsym(dl, "pmemblk_close"); - if (!pmemblk_close) - goto errorout; - pmemblk_nblock = dlsym(dl, "pmemblk_nblock"); - if (!pmemblk_nblock) - goto errorout; - pmemblk_bsize = dlsym(dl, "pmemblk_bsize"); - if (!pmemblk_bsize) - goto errorout; - pmemblk_read = dlsym(dl, "pmemblk_read"); - if (!pmemblk_read) - goto errorout; - pmemblk_write = dlsym(dl, "pmemblk_write"); - if (!pmemblk_write) - goto errorout; - - return 0; - -errorout: - log_err("fio: unable to load libpmemblk: %s\n", dlerror()); - if (dl) - dlclose(dl); - - return -1; -} - typedef struct fio_pmemblk_file *fio_pmemblk_file_t; struct fio_pmemblk_file { @@ -136,10 +86,6 @@ size_t pmb_bsize; size_t pmb_nblocks; }; -#define FIOFILEPMBSET(_f, _v) do { \ - (_f)->engine_data = (uint64_t)(uintptr_t)(_v); \ -} while(0) -#define FIOFILEPMBGET(_f) ((fio_pmemblk_file_t)((_f)->engine_data)) static fio_pmemblk_file_t Cache; @@ -187,7 +133,7 @@ * level, we allow the block size and file size to be appended * to the file name: * - * path[,bsize,fsizemb] + 
* path[,bsize,fsizemib] * * note that we do not use the fio option "filesize" to dictate * the file size because we can only give libpmemblk the gross @@ -197,7 +143,7 @@ * the final path without the parameters is returned in ppath. * the block size and file size are returned in pbsize and fsize. * - * note that the user should specify the file size in MiB, but + * note that the user specifies the file size in MiB, but * we return bytes from here. */ static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize, @@ -206,7 +152,7 @@ char *path; char *s; uint64_t bsize; - uint64_t fsizemb; + uint64_t fsizemib; path = strdup(pathspec); if (!path) { @@ -216,14 +162,14 @@ /* extract sizes, if given */ s = strrchr(path, ','); - if (s && (fsizemb = strtoull(s + 1, NULL, 10))) { + if (s && (fsizemib = strtoull(s + 1, NULL, 10))) { *s = 0; s = strrchr(path, ','); if (s && (bsize = strtoull(s + 1, NULL, 10))) { *s = 0; *ppath = path; *pbsize = bsize; - *pfsize = fsizemb << 20; + *pfsize = fsizemib << 20; return; } } @@ -250,11 +196,6 @@ pmb = fio_pmemblk_cache_lookup(path); if (!pmb) { - /* load libpmemblk if needed */ - if (!pmemblk_open) - if (load_libpmemblk(getenv("FIO_PMEMBLK_LIB"))) - goto error; - pmb = malloc(sizeof(*pmb)); if (!pmb) goto error; @@ -267,9 +208,8 @@ pmemblk_create(path, bsize, fsize, 0644); } if (!pmb->pmb_pool) { - log_err - ("fio: enable to open pmemblk pool file (errno %d)\n", - errno); + log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n", + path, strerror(errno)); goto error; } @@ -331,14 +271,14 @@ if (!td->o.use_thread) { if (!thread_warned) { thread_warned = 1; - log_err("fio: must set thread=1 for pmemblk engine\n"); + log_err("pmemblk: must set thread=1 for pmemblk engine\n"); } return 1; } if (!td->o.odirect && !odirect_warned) { odirect_warned = 1; - log_info("fio: direct == 0, but pmemblk is always direct\n"); + log_info("pmemblk: direct == 0, but pmemblk is always direct\n"); } if (td->o.allow_create) @@ 
-360,26 +300,26 @@ if (!pmb) return 1; - FIOFILEPMBSET(f, pmb); + FILE_SET_ENG_DATA(f, pmb); return 0; } static int fio_pmemblk_close_file(struct thread_data fio_unused *td, struct fio_file *f) { - fio_pmemblk_file_t pmb = FIOFILEPMBGET(f); + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); if (pmb) pmb_close(pmb, false); - FIOFILEPMBSET(f, NULL); + FILE_SET_ENG_DATA(f, NULL); return 0; } static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f) { uint64_t flags = 0; - fio_pmemblk_file_t pmb = FIOFILEPMBGET(f); + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); if (fio_file_size_known(f)) return 0; @@ -396,7 +336,7 @@ fio_file_set_size_known(f); - if (!FIOFILEPMBGET(f)) + if (!FILE_ENG_DATA(f)) pmb_close(pmb, true); return 0; @@ -405,19 +345,16 @@ static int fio_pmemblk_queue(struct thread_data *td, struct io_u *io_u) { struct fio_file *f = io_u->file; - fio_pmemblk_file_t pmb = FIOFILEPMBGET(f); + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); unsigned long long off; unsigned long len; void *buf; - int (*blkop) (PMEMblkpool *, void *, off_t) = (void *)pmemblk_write; fio_ro_check(td, io_u); switch (io_u->ddir) { case DDIR_READ: - blkop = pmemblk_read; - /* fall through */ case DDIR_WRITE: off = io_u->offset; len = io_u->xfer_buflen; @@ -435,7 +372,11 @@ off /= pmb->pmb_bsize; len /= pmb->pmb_bsize; while (0 < len) { - if (0 != blkop(pmb->pmb_pool, buf, off)) { + if (io_u->ddir == DDIR_READ && + 0 != pmemblk_read(pmb->pmb_pool, buf, off)) { + io_u->error = errno; + break; + } else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) { io_u->error = errno; break; } @@ -482,7 +423,7 @@ return 0; } -struct ioengine_ops ioengine = { +static struct ioengine_ops ioengine = { .name = "pmemblk", .version = FIO_IOOPS_VERSION, .queue = fio_pmemblk_queue, diff -Nru fio-2.16/engines/rbd.c fio-3.1/engines/rbd.c --- fio-2.16/engines/rbd.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/rbd.c 2017-09-28 10:23:20.000000000 +0000 @@ -36,6 +36,7 @@ struct io_u 
**aio_events; struct io_u **sort_events; int fd; /* add for poll */ + bool connected; }; struct rbd_options { @@ -111,6 +112,8 @@ if (!rbd) goto failed; + rbd->connected = false; + /* add for poll, init fd: -1 */ rbd->fd = -1; @@ -287,7 +290,7 @@ */ ret = rbd_aio_get_return_value(fri->completion); if (ret < 0) { - io_u->error = ret; + io_u->error = -ret; io_u->resid = io_u->xfer_buflen; } else io_u->error = 0; @@ -514,6 +517,7 @@ } else { dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__, io_u->ddir); + r = -EINVAL; goto failed_comp; } @@ -521,7 +525,7 @@ failed_comp: rbd_aio_release(fri->completion); failed: - io_u->error = r; + io_u->error = -r; td_verror(td, io_u->error, "xfer"); return FIO_Q_COMPLETED; } @@ -529,6 +533,10 @@ static int fio_rbd_init(struct thread_data *td) { int r; + struct rbd_data *rbd = td->io_ops_data; + + if (rbd->connected) + return 0; r = _fio_rbd_connect(td); if (r) { @@ -559,13 +567,8 @@ rbd_image_info_t info; struct fio_file *f; struct rbd_data *rbd = NULL; - int major, minor, extra; int r; - /* log version of librbd. No cluster connection required. */ - rbd_version(&major, &minor, &extra); - log_info("rbd engine: RBD version: %d.%d.%d\n", major, minor, extra); - /* allocate engine specific structure to deal with librbd. */ r = _fio_setup_rbd_data(td, &rbd); if (r) { @@ -589,19 +592,20 @@ log_err("fio_rbd_connect failed.\n"); goto cleanup; } + rbd->connected = true; /* get size of the RADOS block device */ r = rbd_stat(rbd->image, &info, sizeof(info)); if (r < 0) { log_err("rbd_status failed.\n"); - goto disconnect; + goto cleanup; } else if (info.size == 0) { log_err("image size should be larger than zero.\n"); r = -EINVAL; - goto disconnect; + goto cleanup; } - dprint(FD_IO, "rbd-engine: image size: %lu\n", info.size); + dprint(FD_IO, "rbd-engine: image size: %" PRIu64 "\n", info.size); /* taken from "net" engine. Pretend we deal with files, * even if we do not have any ideas about files. 
@@ -615,14 +619,8 @@ f = td->files[0]; f->real_file_size = info.size; - /* disconnect, then we were only connected to determine - * the size of the RBD. - */ - _fio_rbd_disconnect(rbd); return 0; -disconnect: - _fio_rbd_disconnect(rbd); cleanup: fio_rbd_cleanup(td); return r; diff -Nru fio-2.16/engines/rdma.c fio-3.1/engines/rdma.c --- fio-2.16/engines/rdma.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/rdma.c 2017-09-28 10:23:20.000000000 +0000 @@ -44,7 +44,6 @@ #include "../optgroup.h" #include -#include #define FIO_RDMA_MAX_IO_DEPTH 512 @@ -216,7 +215,7 @@ rd->rmt_nr = ntohl(rd->recv_buf.nr); for (i = 0; i < rd->rmt_nr; i++) { - rd->rmt_us[i].buf = ntohll(rd->recv_buf.rmt_us[i].buf); + rd->rmt_us[i].buf = be64_to_cpu(rd->recv_buf.rmt_us[i].buf); rd->rmt_us[i].rkey = ntohl(rd->recv_buf.rmt_us[i].rkey); rd->rmt_us[i].size = ntohl(rd->recv_buf.rmt_us[i].size); @@ -802,7 +801,7 @@ unsigned int nr) { struct rdmaio_data *rd = td->io_ops_data; - struct timeval now; + struct timespec now; unsigned int i; if (!fio_fill_issue_time(td)) @@ -881,7 +880,7 @@ rd->send_buf.nr = htonl(td->o.iodepth); if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_send fail: %m"); + log_err("fio: ibv_post_send fail: %m\n"); return 1; } @@ -932,7 +931,7 @@ ret = rdma_poll_wait(td, IBV_WC_RECV) < 0; if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_send fail: %m"); + log_err("fio: ibv_post_send fail: %m\n"); return 1; } @@ -965,7 +964,7 @@ || (rd->rdma_protocol == FIO_RDMA_MEM_READ))) { if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_send fail: %m"); + log_err("fio: ibv_post_send fail: %m\n"); return 1; } @@ -1300,7 +1299,7 @@ } rd->send_buf.rmt_us[i].buf = - htonll((uint64_t) (unsigned long)io_u->buf); + cpu_to_be64((uint64_t) (unsigned long)io_u->buf); rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey); rd->send_buf.rmt_us[i].size = htonl(max_bs); diff -Nru fio-2.16/engines/sg.c 
fio-3.1/engines/sg.c --- fio-2.16/engines/sg.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/sg.c 2017-09-28 10:23:20.000000000 +0000 @@ -20,7 +20,7 @@ #define MAX_SB 64 // sense block maximum return size struct sgio_cmd { - unsigned char cdb[16]; // increase to support 16 byte commands + unsigned char cdb[16]; // enhanced from 10 to support 16 byte commands unsigned char sb[MAX_SB]; // add sense block to commands int nr; }; @@ -32,7 +32,6 @@ int *fd_flags; void *sgbuf; unsigned int bs; - long long max_lba; int type_checked; }; @@ -125,7 +124,7 @@ } while (left) { - void *p; + char *p; dprint(FD_IO, "sgio_getevents: sd %p: left=%d\n", sd, left); @@ -185,7 +184,7 @@ if (hdr->info & SG_INFO_CHECK) { struct io_u *io_u; io_u = (struct io_u *)(hdr->usr_ptr); - memcpy((void*)&(io_u->hdr), (void*)hdr, sizeof(struct sg_io_hdr)); + memcpy(&io_u->hdr, hdr, sizeof(struct sg_io_hdr)); sd->events[i]->error = EIO; } } @@ -253,7 +252,7 @@ struct fio_file *f = io_u->file; int ret; - if (f->filetype == FIO_TYPE_BD) { + if (f->filetype == FIO_TYPE_BLOCK) { ret = fio_sgio_ioctl_doio(td, f, io_u); td->error = io_u->error; } else { @@ -309,7 +308,6 @@ * blocks on medium. */ if (hdr->dxfer_direction != SG_DXFER_NONE) { - if (lba < MAX_10B_LBA) { hdr->cmdp[2] = (unsigned char) ((lba >> 24) & 0xff); hdr->cmdp[3] = (unsigned char) ((lba >> 16) & 0xff); @@ -416,17 +414,16 @@ } *bs = (buf[4] << 24) | (buf[5] << 16) | (buf[6] << 8) | buf[7]; - *max_lba = ((buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]) & 0x00000000FFFFFFFFULL; // for some reason max_lba is being sign extended even though unsigned. - + *max_lba = ((buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]) & MAX_10B_LBA; // for some reason max_lba is being sign extended even though unsigned. /* - * If max lba is 0xFFFFFFFF, then need to retry with - * 16 byteread capacity + * If max lba masked by MAX_10B_LBA equals MAX_10B_LBA, + * then need to retry with 16 byte Read Capacity command. 
*/ if (*max_lba == MAX_10B_LBA) { hdr.cmd_len = 16; - hdr.cmdp[0] = 0x9e; // Read Capacity(16) - hdr.cmdp[1] = 0x10; // service action + hdr.cmdp[0] = 0x9e; // service action + hdr.cmdp[1] = 0x10; // Read Capacity(16) hdr.cmdp[10] = (unsigned char) ((sizeof(buf) >> 24) & 0xff); hdr.cmdp[11] = (unsigned char) ((sizeof(buf) >> 16) & 0xff); hdr.cmdp[12] = (unsigned char) ((sizeof(buf) >> 8) & 0xff); @@ -507,8 +504,7 @@ unsigned int bs = 0; unsigned long long max_lba = 0; - - if (f->filetype == FIO_TYPE_BD) { + if (f->filetype == FIO_TYPE_BLOCK) { if (ioctl(f->fd, BLKSSZGET, &bs) < 0) { td_verror(td, errno, "ioctl"); return 1; @@ -529,19 +525,19 @@ } } else { td_verror(td, EINVAL, "wrong file type"); - log_err("ioengine sg only works on block devices\n"); + log_err("ioengine sg only works on block or character devices\n"); return 1; } sd->bs = bs; // Determine size of commands needed based on max_lba - sd->max_lba = max_lba; - if (max_lba > MAX_10B_LBA) { - dprint(FD_IO, "sgio_type_check: using 16 byte operations: max_lba = 0x%016llx\n", max_lba); + if (max_lba >= MAX_10B_LBA) { + dprint(FD_IO, "sgio_type_check: using 16 byte read/write " + "commands for lba above 0x%016llx/0x%016llx\n", + MAX_10B_LBA, max_lba); } - - if (f->filetype == FIO_TYPE_BD) { + if (f->filetype == FIO_TYPE_BLOCK) { td->io_ops->getevents = NULL; td->io_ops->event = NULL; } @@ -576,17 +572,17 @@ struct sg_io_hdr *hdr = &io_u->hdr; #define MAXERRDETAIL 1024 #define MAXMSGCHUNK 128 - char *msg, msgchunk[MAXMSGCHUNK], *ret = NULL; + char *msg, msgchunk[MAXMSGCHUNK]; int i; - msg = calloc(MAXERRDETAIL, 1); + msg = calloc(1, MAXERRDETAIL); + strcpy(msg, ""); /* * can't seem to find sg_err.h, so I'll just echo the define values * so others can search on internet to find clearer clues of meaning. 
*/ if (hdr->info & SG_INFO_CHECK) { - ret = msg; if (hdr->host_status) { snprintf(msgchunk, MAXMSGCHUNK, "SG Host Status: 0x%02x; ", hdr->host_status); strlcat(msg, msgchunk, MAXERRDETAIL); @@ -630,6 +626,24 @@ case 0x0d: strlcat(msg, "SG_ERR_DID_REQUEUE", MAXERRDETAIL); break; + case 0x0e: + strlcat(msg, "SG_ERR_DID_TRANSPORT_DISRUPTED", MAXERRDETAIL); + break; + case 0x0f: + strlcat(msg, "SG_ERR_DID_TRANSPORT_FAILFAST", MAXERRDETAIL); + break; + case 0x10: + strlcat(msg, "SG_ERR_DID_TARGET_FAILURE", MAXERRDETAIL); + break; + case 0x11: + strlcat(msg, "SG_ERR_DID_NEXUS_FAILURE", MAXERRDETAIL); + break; + case 0x12: + strlcat(msg, "SG_ERR_DID_ALLOC_FAILURE", MAXERRDETAIL); + break; + case 0x13: + strlcat(msg, "SG_ERR_DID_MEDIUM_ERROR", MAXERRDETAIL); + break; default: strlcat(msg, "Unknown", MAXERRDETAIL); break; @@ -741,14 +755,14 @@ if (hdr->resid != 0) { snprintf(msgchunk, MAXMSGCHUNK, "SG Driver: %d bytes out of %d not transferred. ", hdr->resid, hdr->dxfer_len); strlcat(msg, msgchunk, MAXERRDETAIL); - ret = msg; } } - if (!ret) - ret = strdup("SG Driver did not report a Host, Driver or Device check"); + if (!(hdr->info & SG_INFO_CHECK) && !strlen(msg)) + strncpy(msg, "SG Driver did not report a Host, Driver or Device check", + MAXERRDETAIL - 1); - return ret; + return msg; } /* @@ -775,6 +789,12 @@ if (fio_file_size_known(f)) return 0; + if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) { + td_verror(td, EINVAL, "wrong file type"); + log_err("ioengine sg only works on block or character devices\n"); + return 1; + } + ret = fio_sgio_read_capacity(td, &bs, &max_lba); if (ret ) { td_verror(td, td->error, "fio_sgio_read_capacity"); @@ -800,7 +820,7 @@ .cleanup = fio_sgio_cleanup, .open_file = fio_sgio_open, .close_file = generic_close_file, - .get_file_size = fio_sgio_get_file_size, // generic_get_file_size + .get_file_size = fio_sgio_get_file_size, .flags = FIO_SYNCIO | FIO_RAWIO, }; diff -Nru fio-2.16/engines/skeleton_external.c 
fio-3.1/engines/skeleton_external.c --- fio-2.16/engines/skeleton_external.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/skeleton_external.c 2017-09-28 10:23:20.000000000 +0000 @@ -3,7 +3,8 @@ * * Should be compiled with: * - * gcc -Wall -O2 -g -shared -rdynamic -fPIC -o engine.o engine.c + * gcc -Wall -O2 -g -shared -rdynamic -fPIC -o skeleton_external.o skeleton_external.c + * (also requires -D_GNU_SOURCE -DCONFIG_STRSEP on Linux) * */ #include @@ -13,6 +14,7 @@ #include #include "../fio.h" +#include "../optgroup.h" /* * The core of the module is identical to the ones included with fio, @@ -21,6 +23,32 @@ */ /* + * The io engine can define its own options within the io engine source. + * The option member must not be at offset 0, due to the way fio parses + * the given option. Just add a padding pointer unless the io engine has + * something usable. + */ +struct fio_skeleton_options { + void *pad; /* avoid ->off1 of fio_option becomes 0 */ + unsigned int dummy; +}; + +static struct fio_option options[] = { + { + .name = "dummy", + .lname = "ldummy", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct fio_skeleton_options, dummy), + .help = "Set dummy", + .category = FIO_OPT_C_ENGINE, /* always use this */ + .group = FIO_OPT_G_INVALID, /* this can be different */ + }, + { + .name = NULL, + }, +}; + +/* * The ->event() hook is called to match an event number with an io_u. * After the core has called ->getevents() and it has returned eg 3, * the ->event() hook must return the 3 events that have completed for @@ -109,11 +137,11 @@ /* * Hook for opening the given file. Unless the engine has special - * needs, it usually just provides generic_file_open() as the handler. + * needs, it usually just provides generic_open_file() as the handler. 
*/ static int fio_skeleton_open(struct thread_data *td, struct fio_file *f) { - return generic_file_open(td, f); + return generic_open_file(td, f); } /* @@ -121,12 +149,12 @@ */ static int fio_skeleton_close(struct thread_data *td, struct fio_file *f) { - generic_file_close(td, f); + return generic_close_file(td, f); } /* * Note that the structure is exported, so that fio can get it via - * dlsym(..., "ioengine"); + * dlsym(..., "ioengine"); for (and only for) external engines. */ struct ioengine_ops ioengine = { .name = "engine_name", @@ -140,4 +168,6 @@ .cleanup = fio_skeleton_cleanup, .open_file = fio_skeleton_open, .close_file = fio_skeleton_close, + .options = options, + .option_struct_size = sizeof(struct fio_skeleton_options), }; diff -Nru fio-2.16/engines/splice.c fio-3.1/engines/splice.c --- fio-2.16/engines/splice.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/splice.c 2017-09-28 10:23:20.000000000 +0000 @@ -32,7 +32,7 @@ struct fio_file *f = io_u->file; int ret, ret2, buflen; off_t offset; - void *p; + char *p; offset = io_u->offset; buflen = io_u->xfer_buflen; @@ -77,7 +77,8 @@ struct iovec iov; int ret , buflen, mmap_len; off_t offset; - void *p, *map; + void *map; + char *p; ret = 0; offset = io_u->offset; diff -Nru fio-2.16/engines/sync.c fio-3.1/engines/sync.c --- fio-2.16/engines/sync.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/sync.c 2017-09-28 10:23:20.000000000 +0000 @@ -14,11 +14,12 @@ #include "../fio.h" #include "../optgroup.h" +#include "../lib/rand.h" /* * Sync engine uses engine_data to store last offset */ -#define LAST_POS(f) ((f)->engine_data) +#define LAST_POS(f) ((f)->engine_pos) struct syncio_data { struct iovec *iovecs; @@ -30,12 +31,15 @@ unsigned long long last_offset; struct fio_file *last_file; enum fio_ddir last_ddir; + + struct frand_state rand_state; }; #ifdef FIO_HAVE_PWRITEV2 struct psyncv2_options { void *pad; unsigned int hipri; + unsigned int hipri_percentage; }; static struct fio_option 
options[] = { @@ -49,6 +53,18 @@ .group = FIO_OPT_G_INVALID, }, { + .name = "hipri_percentage", + .lname = "RWF_HIPRI_PERCENTAGE", + .type = FIO_OPT_INT, + .off1 = offsetof(struct psyncv2_options, hipri_percentage), + .minval = 0, + .maxval = 100, + .def = "100", + .help = "Probabilistically set RWF_HIPRI for pwritev2/preadv2", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { .name = NULL, }, }; @@ -132,7 +148,8 @@ fio_ro_check(td, io_u); - if (o->hipri) + if (o->hipri && + (rand32_between(&sd->rand_state, 1, 100) <= o->hipri_percentage)) flags |= RWF_HIPRI; iov->iov_base = io_u->xfer_buf; @@ -363,6 +380,7 @@ sd->last_offset = -1ULL; sd->iovecs = malloc(td->o.iodepth * sizeof(struct iovec)); sd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *)); + init_rand(&sd->rand_state, 0); td->io_ops_data = sd; return 0; diff -Nru fio-2.16/engines/windowsaio.c fio-3.1/engines/windowsaio.c --- fio-2.16/engines/windowsaio.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/engines/windowsaio.c 2017-09-28 10:23:20.000000000 +0000 @@ -35,17 +35,7 @@ struct windowsaio_data *wd; }; -static BOOL timeout_expired(DWORD start_count, DWORD end_count); -static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, const struct timespec *t); -static struct io_u *fio_windowsaio_event(struct thread_data *td, int event); -static int fio_windowsaio_queue(struct thread_data *td, - struct io_u *io_u); -static void fio_windowsaio_cleanup(struct thread_data *td); static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter); -static int fio_windowsaio_init(struct thread_data *td); -static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f); -static int fio_windowsaio_close_file(struct thread_data fio_unused *td, struct fio_file *f); static int fio_windowsaio_init(struct thread_data *td) { @@ -152,7 +142,6 @@ } } - static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f) { int rc = 0; @@ -180,13 
+169,26 @@ /* * Inform Windows whether we're going to be doing sequential or - * random io so it can tune the Cache Manager + * random IO so it can tune the Cache Manager */ - if (td->o.td_ddir == TD_DDIR_READ || - td->o.td_ddir == TD_DDIR_WRITE) - flags |= FILE_FLAG_SEQUENTIAL_SCAN; - else + switch (td->o.fadvise_hint) { + case F_ADV_TYPE: + if (td_random(td)) + flags |= FILE_FLAG_RANDOM_ACCESS; + else + flags |= FILE_FLAG_SEQUENTIAL_SCAN; + break; + case F_ADV_RANDOM: flags |= FILE_FLAG_RANDOM_ACCESS; + break; + case F_ADV_SEQUENTIAL: + flags |= FILE_FLAG_SEQUENTIAL_SCAN; + break; + case F_ADV_NONE: + break; + default: + log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint); + } if (!td_write(td) || read_only) access = GENERIC_READ; diff -Nru fio-2.16/eta.c fio-3.1/eta.c --- fio-2.16/eta.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/eta.c 2017-09-28 10:23:20.000000000 +0000 @@ -308,7 +308,7 @@ diff = io_bytes[i] - prev_io_bytes[i]; if (mtime) - this_rate = ((1000 * diff) / mtime) / 1024; + this_rate = ((1000 * diff) / mtime) / 1024; /* KiB/s */ else this_rate = 0; @@ -358,12 +358,12 @@ uint64_t rate_time, disp_time, bw_avg_time, *eta_secs; unsigned long long io_bytes[DDIR_RWDIR_CNT]; unsigned long long io_iops[DDIR_RWDIR_CNT]; - struct timeval now; + struct timespec now; static unsigned long long rate_io_bytes[DDIR_RWDIR_CNT]; static unsigned long long disp_io_bytes[DDIR_RWDIR_CNT]; static unsigned long long disp_io_iops[DDIR_RWDIR_CNT]; - static struct timeval rate_prev_time, disp_prev_time; + static struct timespec rate_prev_time, disp_prev_time; if (!force) { if (!(output_format & FIO_OUTPUT_NORMAL) && @@ -440,7 +440,7 @@ if (td->runstate > TD_SETTING_UP) { int ddir; - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { if (unified_rw_rep) { io_bytes[0] += td->io_bytes[ddir]; io_iops[0] += td->io_blocks[ddir]; @@ -511,7 +511,7 @@ void display_thread_status(struct jobs_eta *je) { - static struct 
timeval disp_eta_new_line; + static struct timespec disp_eta_new_line; static int eta_new_line_init, eta_new_line_pending; static int linelen_last; static int eta_good; @@ -530,19 +530,28 @@ } p += sprintf(p, "Jobs: %d (f=%d)", je->nr_running, je->files_open); - if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) { + + /* rate limits, if any */ + if (je->m_rate[0] || je->m_rate[1] || je->m_rate[2] || + je->t_rate[0] || je->t_rate[1] || je->t_rate[2]) { char *tr, *mr; - mr = num2str(je->m_rate[0] + je->m_rate[1], 4, 0, je->is_pow2, 8); - tr = num2str(je->t_rate[0] + je->t_rate[1], 4, 0, je->is_pow2, 8); - p += sprintf(p, ", CR=%s/%s KB/s", tr, mr); + mr = num2str(je->m_rate[0] + je->m_rate[1] + je->m_rate[2], + 4, 0, je->is_pow2, N2S_BYTEPERSEC); + tr = num2str(je->t_rate[0] + je->t_rate[1] + je->t_rate[2], + 4, 0, je->is_pow2, N2S_BYTEPERSEC); + + p += sprintf(p, ", %s-%s", mr, tr); free(tr); free(mr); - } else if (je->m_iops[0] || je->m_iops[1] || je->t_iops[0] || je->t_iops[1]) { - p += sprintf(p, ", CR=%d/%d IOPS", - je->t_iops[0] + je->t_iops[1], - je->m_iops[0] + je->m_iops[1]); + } else if (je->m_iops[0] || je->m_iops[1] || je->m_iops[2] || + je->t_iops[0] || je->t_iops[1] || je->t_iops[2]) { + p += sprintf(p, ", %d-%d IOPS", + je->m_iops[0] + je->m_iops[1] + je->m_iops[2], + je->t_iops[0] + je->t_iops[1] + je->t_iops[2]); } + + /* current run string, % done, bandwidth, iops, eta */ if (je->eta_sec != INT_MAX && je->nr_running) { char perc_str[32]; char *iops_str[DDIR_RWDIR_CNT]; @@ -553,7 +562,7 @@ if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running || je->eta_sec == -1) - strcpy(perc_str, "-.-% done"); + strcpy(perc_str, "-.-%"); else { double mult = 100.0; @@ -562,28 +571,37 @@ eta_good = 1; perc *= mult; - sprintf(perc_str, "%3.1f%% done", perc); + sprintf(perc_str, "%3.1f%%", perc); } - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { - rate_str[ddir] = num2str(je->rate[ddir], 5, + for (ddir = 0; ddir < 
DDIR_RWDIR_CNT; ddir++) { + rate_str[ddir] = num2str(je->rate[ddir], 4, 1024, je->is_pow2, je->unit_base); - iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, 0); + iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, N2S_NONE); } left = sizeof(output) - (p - output) - 1; - l = snprintf(p, left, ": [%s] [%s] [%s/%s/%s /s] [%s/%s/%s iops] [eta %s]", + if (je->rate[DDIR_TRIM] || je->iops[DDIR_TRIM]) + l = snprintf(p, left, + ": [%s][%s][r=%s,w=%s,t=%s][r=%s,w=%s,t=%s IOPS][eta %s]", je->run_str, perc_str, rate_str[DDIR_READ], rate_str[DDIR_WRITE], rate_str[DDIR_TRIM], iops_str[DDIR_READ], iops_str[DDIR_WRITE], iops_str[DDIR_TRIM], eta_str); + else + l = snprintf(p, left, + ": [%s][%s][r=%s,w=%s][r=%s,w=%s IOPS][eta %s]", + je->run_str, perc_str, + rate_str[DDIR_READ], rate_str[DDIR_WRITE], + iops_str[DDIR_READ], iops_str[DDIR_WRITE], + eta_str); p += l; if (l >= 0 && l < linelen_last) p += sprintf(p, "%*s", linelen_last - l, ""); linelen_last = l; - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { free(rate_str[ddir]); free(iops_str[ddir]); } diff -Nru fio-2.16/examples/butterfly.fio fio-3.1/examples/butterfly.fio --- fio-2.16/examples/butterfly.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/examples/butterfly.fio 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,19 @@ +# Perform a butterfly/funnel seek pattern. This won't always alternate ends on +# every I/O but it will get close. 
+ +[global] +filename=/tmp/testfile +bs=4k +direct=1 + +[forward] +rw=read +flow=2 +# Uncomment the size= and offset= lines to prevent each direction going past +# the middle of the file +#size=50% + +[backward] +rw=read:-8k +flow=-2 +#offset=50% diff -Nru fio-2.16/examples/ftruncate.fio fio-3.1/examples/ftruncate.fio --- fio-2.16/examples/ftruncate.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/examples/ftruncate.fio 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,27 @@ +# Example ftruncate engine jobs + +[global] +ioengine=ftruncate +directory=/scratch +size=102404k ; 100Mb+4k +stonewall +filename=truncate +runtime=10s +time_based +direct=1 +# +# bs option is stub here. Truncation is performed on the current block offset. +# blocksize value is ignored +bs=4k + +# truncate the file to 4Kbytes then repeatedly grow the file back to just over +# its original size using subsequent truncates +[grow-truncate] +rw=write + +# Repeatedly change a file to a random size between 0Kbytes and 100Mb +# using truncates +[rand-truncate] +rw=randwrite +norandommap + diff -Nru fio-2.16/examples/gpudirect-rdmaio-client.fio fio-3.1/examples/gpudirect-rdmaio-client.fio --- fio-2.16/examples/gpudirect-rdmaio-client.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/examples/gpudirect-rdmaio-client.fio 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,15 @@ +# Example gpudirect rdma client job +[global] +ioengine=rdma +hostname=[hostname] +port=[port] +verb=[read/write/send/recv] +mem=cudamalloc +gpu_dev_id=0 +bs=1m +size=100g + +[sender] +rw=write +iodepth=1 +iodepth_batch_complete=1 diff -Nru fio-2.16/examples/gpudirect-rdmaio-server.fio fio-3.1/examples/gpudirect-rdmaio-server.fio --- fio-2.16/examples/gpudirect-rdmaio-server.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/examples/gpudirect-rdmaio-server.fio 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,12 @@ +# Example rdma server job +[global] +ioengine=rdma +port=[port] +mem=cudamalloc +gpu_dev_id=0 +bs=1m +size=100g + 
+[receiver] +rw=read +iodepth=16 diff -Nru fio-2.16/examples/mtd.fio fio-3.1/examples/mtd.fio --- fio-2.16/examples/mtd.fio 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/examples/mtd.fio 2017-09-28 10:23:20.000000000 +0000 @@ -17,5 +17,5 @@ [write] stonewall block_error_percentiles=1 -rw=writetrim +rw=trimwrite loops=4 diff -Nru fio-2.16/exp/README.md fio-3.1/exp/README.md --- fio-2.16/exp/README.md 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/exp/README.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -simple-expression-parser -======================== - -A simple expression parser for arithmetic expressions made with bison + flex - -To use, see the example test-expression-parser.c - diff -Nru fio-2.16/file.h fio-3.1/file.h --- fio-2.16/file.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/file.h 2017-09-28 10:23:20.000000000 +0000 @@ -15,7 +15,7 @@ */ enum fio_filetype { FIO_TYPE_FILE = 1, /* plain file */ - FIO_TYPE_BD, /* block device */ + FIO_TYPE_BLOCK, /* block device */ FIO_TYPE_CHAR, /* character device */ FIO_TYPE_PIPE, /* pipe */ }; @@ -63,6 +63,7 @@ FIO_FALLOCATE_NONE = 1, FIO_FALLOCATE_POSIX = 2, FIO_FALLOCATE_KEEP_SIZE = 3, + FIO_FALLOCATE_NATIVE = 4, }; /* @@ -90,6 +91,7 @@ /* * size of the file, offset into file, and io size from that offset + * (be aware io_size is different from thread_options::io_size) */ uint64_t real_file_size; uint64_t file_offset; @@ -112,9 +114,12 @@ unsigned int last_write_idx; /* - * For use by the io engine + * For use by the io engine for offset or private data storage */ - uint64_t engine_data; + union { + uint64_t engine_pos; + void *engine_data; + }; /* * if io is protected by a semaphore, this is set @@ -146,14 +151,8 @@ struct disk_util *du; }; -#define FILE_ENG_DATA(f) ((void *) (uintptr_t) (f)->engine_data) -#define FILE_SET_ENG_DATA(f, data) \ - ((f)->engine_data = (uintptr_t) (data)) - -struct file_name { - struct flist_head list; - char *filename; -}; +#define FILE_ENG_DATA(f) ((f)->engine_data) 
+#define FILE_SET_ENG_DATA(f, data) ((f)->engine_data = (data)) #define FILE_FLAG_FNS(name) \ static inline void fio_file_set_##name(struct fio_file *f) \ @@ -212,5 +211,6 @@ extern void fio_file_reset(struct thread_data *, struct fio_file *); extern bool fio_files_done(struct thread_data *); extern bool exists_and_not_regfile(const char *); +extern int fio_set_directio(struct thread_data *, struct fio_file *); #endif diff -Nru fio-2.16/filesetup.c fio-3.1/filesetup.c --- fio-2.16/filesetup.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/filesetup.c 2017-09-28 10:23:20.000000000 +0000 @@ -24,18 +24,90 @@ static FLIST_HEAD(filename_list); +/* + * List entry for filename_list + */ +struct file_name { + struct flist_head list; + char *filename; +}; + static inline void clear_error(struct thread_data *td) { td->error = 0; td->verror[0] = '\0'; } +static inline int native_fallocate(struct thread_data *td, struct fio_file *f) +{ + bool success; + + success = fio_fallocate(f, 0, f->real_file_size); + dprint(FD_FILE, "native fallocate of file %s size %llu was " + "%ssuccessful\n", f->file_name, + (unsigned long long) f->real_file_size, + !success ? 
"un": ""); + + if (success) + return 0; + + if (errno == ENOSYS) + dprint(FD_FILE, "native fallocate is not implemented\n"); + + return -1; +} + +static void fallocate_file(struct thread_data *td, struct fio_file *f) +{ + int r; + + if (td->o.fill_device) + return; + + switch (td->o.fallocate_mode) { + case FIO_FALLOCATE_NATIVE: + r = native_fallocate(td, f); + if (r != 0 && errno != ENOSYS) + log_err("fio: native_fallocate call failed: %s\n", + strerror(errno)); + break; + case FIO_FALLOCATE_NONE: + break; +#ifdef CONFIG_POSIX_FALLOCATE + case FIO_FALLOCATE_POSIX: + dprint(FD_FILE, "posix_fallocate file %s size %llu\n", + f->file_name, + (unsigned long long) f->real_file_size); + + r = posix_fallocate(f->fd, 0, f->real_file_size); + if (r > 0) + log_err("fio: posix_fallocate fails: %s\n", strerror(r)); + break; +#endif /* CONFIG_POSIX_FALLOCATE */ +#ifdef CONFIG_LINUX_FALLOCATE + case FIO_FALLOCATE_KEEP_SIZE: + dprint(FD_FILE, "fallocate(FALLOC_FL_KEEP_SIZE) " + "file %s size %llu\n", f->file_name, + (unsigned long long) f->real_file_size); + + r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0, f->real_file_size); + if (r != 0) + td_verror(td, errno, "fallocate"); + + break; +#endif /* CONFIG_LINUX_FALLOCATE */ + default: + log_err("fio: unknown fallocate mode: %d\n", td->o.fallocate_mode); + assert(0); + } +} + /* * Leaves f->fd open on success, caller must close */ static int extend_file(struct thread_data *td, struct fio_file *f) { - int r, new_layout = 0, unlink_file = 0, flags; + int new_layout = 0, unlink_file = 0, flags; unsigned long long left; unsigned int bs; char *b = NULL; @@ -92,44 +164,11 @@ return 1; } -#ifdef CONFIG_POSIX_FALLOCATE - if (!td->o.fill_device) { - switch (td->o.fallocate_mode) { - case FIO_FALLOCATE_NONE: - break; - case FIO_FALLOCATE_POSIX: - dprint(FD_FILE, "posix_fallocate file %s size %llu\n", - f->file_name, - (unsigned long long) f->real_file_size); - - r = posix_fallocate(f->fd, 0, f->real_file_size); - if (r > 0) { - log_err("fio: 
posix_fallocate fails: %s\n", - strerror(r)); - } - break; -#ifdef CONFIG_LINUX_FALLOCATE - case FIO_FALLOCATE_KEEP_SIZE: - dprint(FD_FILE, - "fallocate(FALLOC_FL_KEEP_SIZE) " - "file %s size %llu\n", f->file_name, - (unsigned long long) f->real_file_size); - - r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0, - f->real_file_size); - if (r != 0) - td_verror(td, errno, "fallocate"); - - break; -#endif /* CONFIG_LINUX_FALLOCATE */ - default: - log_err("fio: unknown fallocate mode: %d\n", - td->o.fallocate_mode); - assert(0); - } - } -#endif /* CONFIG_POSIX_FALLOCATE */ + fallocate_file(td, f); + /* + * If our jobs don't require regular files initially, we're done. + */ if (!new_layout) goto done; @@ -148,11 +187,20 @@ } } - b = malloc(td->o.max_bs[DDIR_WRITE]); - left = f->real_file_size; + bs = td->o.max_bs[DDIR_WRITE]; + if (bs > left) + bs = left; + + b = malloc(bs); + if (!b) { + td_verror(td, errno, "malloc"); + goto err; + } + while (left && !td->terminate) { - bs = td->o.max_bs[DDIR_WRITE]; + ssize_t r; + if (bs > left) bs = left; @@ -217,7 +265,11 @@ unsigned int bs; char *b; - if (td_ioengine_flagged(td, FIO_PIPEIO)) + if (td_ioengine_flagged(td, FIO_PIPEIO) || + td_ioengine_flagged(td, FIO_NOIO)) + return 0; + + if (f->filetype == FIO_TYPE_CHAR) return 0; if (!fio_file_open(f)) { @@ -230,8 +282,17 @@ old_runstate = td_bump_runstate(td, TD_PRE_READING); + left = f->io_size; bs = td->o.max_bs[DDIR_READ]; + if (bs > left) + bs = left; + b = malloc(bs); + if (!b) { + td_verror(td, errno, "malloc"); + ret = 1; + goto error; + } memset(b, 0, bs); if (lseek(f->fd, f->file_offset, SEEK_SET) < 0) { @@ -241,8 +302,6 @@ goto error; } - left = f->io_size; - while (left && !td->terminate) { if (bs > left) bs = left; @@ -370,16 +429,38 @@ if (f->filetype == FIO_TYPE_FILE) ret = file_size(td, f); - else if (f->filetype == FIO_TYPE_BD) + else if (f->filetype == FIO_TYPE_BLOCK) ret = bdev_size(td, f); else if (f->filetype == FIO_TYPE_CHAR) ret = char_size(td, f); else - 
f->real_file_size = -1; + f->real_file_size = -1ULL; + /* + * Leave ->real_file_size with 0 since it could be expectation + * of initial setup for regular files. + */ if (ret) return ret; + /* + * If ->real_file_size is -1, a conditional for the message + * "offset extends end" is always true, but it makes no sense, + * so just return the same value here. + */ + if (f->real_file_size == -1ULL) { + log_info("%s: failed to get file size of %s\n", td->o.name, + f->file_name); + return 1; + } + + if (td->o.start_offset && f->file_offset == 0) + dprint(FD_FILE, "offset of file %s not initialized yet\n", + f->file_name); + /* + * ->file_offset normally hasn't been initialized yet, so this + * is basically always false. + */ if (f->file_offset > f->real_file_size) { log_err("%s: offset extends end (%llu > %llu)\n", td->o.name, (unsigned long long) f->file_offset, @@ -409,20 +490,22 @@ if (len == -1ULL || off == -1ULL) return 0; - dprint(FD_IO, "invalidate cache %s: %llu/%llu\n", f->file_name, off, - len); - if (td->io_ops->invalidate) { + dprint(FD_IO, "invalidate %s cache %s\n", td->io_ops->name, + f->file_name); ret = td->io_ops->invalidate(td, f); if (ret < 0) - errval = ret; + errval = -ret; } else if (f->filetype == FIO_TYPE_FILE) { + dprint(FD_IO, "declare unneeded cache %s: %llu/%llu\n", + f->file_name, off, len); ret = posix_fadvise(f->fd, off, len, POSIX_FADV_DONTNEED); if (ret) errval = ret; - } else if (f->filetype == FIO_TYPE_BD) { + } else if (f->filetype == FIO_TYPE_BLOCK) { int retry_count = 0; + dprint(FD_IO, "drop page cache %s\n", f->file_name); ret = blockdev_invalidate_cache(f); while (ret < 0 && errno == EAGAIN && retry_count++ < 25) { /* @@ -444,8 +527,11 @@ } if (ret < 0) errval = errno; - } else if (f->filetype == FIO_TYPE_CHAR || f->filetype == FIO_TYPE_PIPE) + } else if (f->filetype == FIO_TYPE_CHAR || + f->filetype == FIO_TYPE_PIPE) { + dprint(FD_IO, "invalidate not supported %s\n", f->file_name); ret = 0; + } /* * Cache flushing isn't a fatal 
condition, and we know it will @@ -454,7 +540,8 @@ * continue on our way. */ if (errval) - log_info("fio: cache invalidation of %s failed: %s\n", f->file_name, strerror(errval)); + log_info("fio: cache invalidation of %s failed: %s\n", + f->file_name, strerror(errval)); return 0; @@ -486,7 +573,7 @@ f->shadow_fd = -1; } - f->engine_data = 0; + f->engine_pos = 0; return ret; } @@ -498,9 +585,6 @@ __f = lookup_file_hash(f->file_name); if (__f) { dprint(FD_FILE, "found file in hash %s\n", f->file_name); - /* - * racy, need the __f->lock locked - */ f->lock = __f->lock; from_hash = 1; } else { @@ -597,7 +681,8 @@ f->fd = dup(STDIN_FILENO); else from_hash = file_lookup_open(f, flags); - } else { //td trim + } else if (td_trim(td)) { + assert(!td_rw(td)); /* should have matched above */ flags |= O_RDWR; from_hash = file_lookup_open(f, flags); } @@ -652,6 +737,10 @@ return 0; } +/* + * This function i.e. get_file_size() is the default .get_file_size + * implementation of majority of I/O engines. + */ int generic_get_file_size(struct thread_data *td, struct fio_file *f) { return get_file_size(td, f); @@ -667,7 +756,7 @@ int err = 0; for_each_file(td, f, i) { - dprint(FD_FILE, "get file size for %p/%d/%p\n", f, i, + dprint(FD_FILE, "get file size for %p/%d/%s\n", f, i, f->file_name); if (td_io_get_file_size(td, f)) { @@ -679,6 +768,13 @@ clear_error(td); } + /* + * There are corner cases where we end up with -1 for + * ->real_file_size due to unsupported file type, etc. + * We then just set to size option value divided by number + * of files, similar to the way file ->io_size is set. + * stat(2) failure doesn't set ->real_file_size to -1. 
+ */ if (f->real_file_size == -1ULL && td->o.size) f->real_file_size = td->o.size / td->o.nr_files; } @@ -709,7 +805,7 @@ struct stat sb; char buf[256]; - if (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_CHAR) { + if (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_CHAR) { if (f->real_file_size != -1ULL) ret += f->real_file_size; continue; @@ -765,12 +861,42 @@ uint64_t get_start_offset(struct thread_data *td, struct fio_file *f) { struct thread_options *o = &td->o; + unsigned long long align_bs; + unsigned long long offset; if (o->file_append && f->filetype == FIO_TYPE_FILE) return f->real_file_size; - return td->o.start_offset + - td->subjob_number * td->o.offset_increment; + if (o->start_offset_percent > 0) { + /* + * if blockalign is provided, find the min across read, write, + * and trim + */ + if (fio_option_is_set(o, ba)) { + align_bs = (unsigned long long) min(o->ba[DDIR_READ], o->ba[DDIR_WRITE]); + align_bs = min((unsigned long long) o->ba[DDIR_TRIM], align_bs); + } else { + /* else take the minimum block size */ + align_bs = td_min_bs(td); + } + + /* calculate the raw offset */ + offset = (f->real_file_size * o->start_offset_percent / 100) + + (td->subjob_number * o->offset_increment); + + /* + * block align the offset at the next available boundary at + * ceiling(offset / align_bs) * align_bs + */ + offset = (offset / align_bs + (offset % align_bs != 0)) * align_bs; + + } else { + /* start_offset_percent not set */ + offset = o->start_offset + + td->subjob_number * o->offset_increment; + } + + return offset; } /* @@ -795,7 +921,9 @@ goto done; /* - * if ioengine defines a setup() method, it's responsible for + * Find out physical size of files or devices for this thread, + * before we determine I/O size and range of our targets. + * If ioengine defines a setup() method, it's responsible for * opening the files and setting f->real_file_size to indicate * the valid range for that file. 
*/ @@ -836,7 +964,7 @@ /* * Calculate per-file size and potential extra size for the - * first files, if needed. + * first files, if needed (i.e. if we don't have a fixed size). */ if (!o->file_size_low && o->nr_files) { uint64_t all_fs; @@ -858,11 +986,18 @@ for_each_file(td, f, i) { f->file_offset = get_start_offset(td, f); + /* + * Update ->io_size depending on options specified. + * ->file_size_low being 0 means filesize option isn't set. + * Non zero ->file_size_low equals ->file_size_high means + * filesize option is set in a fixed size format. + * Non zero ->file_size_low not equals ->file_size_high means + * filesize option is set in a range format. + */ if (!o->file_size_low) { /* - * no file size range given, file size is equal to - * total size divided by number of files. If that is - * zero, set it to the real file size. If the size + * no file size or range given, file size is equal to + * total size divided by number of files. If the size * doesn't divide nicely with the min blocksize, * make the first files bigger. */ @@ -872,8 +1007,22 @@ f->io_size += bs; } - if (!f->io_size) + /* + * We normally don't come here for regular files, but + * if the result is 0 for a regular file, set it to the + * real file size. This could be size of the existing + * one if it already exists, but otherwise will be set + * to 0. A new file won't be created because + * ->io_size + ->file_offset equals ->real_file_size. 
+ */ + if (!f->io_size) { + if (f->file_offset > f->real_file_size) + goto err_offset; f->io_size = f->real_file_size - f->file_offset; + if (!f->io_size) + log_info("fio: file %s may be ignored\n", + f->file_name); + } } else if (f->real_file_size < o->file_size_low || f->real_file_size > o->file_size_high) { if (f->file_offset > o->file_size_low) @@ -895,7 +1044,14 @@ total_size = -1ULL; else { if (o->size_percent) { - f->io_size = (f->io_size * o->size_percent) / 100; + uint64_t file_size; + + file_size = f->io_size + f->file_offset; + f->io_size = (file_size * + o->size_percent) / 100; + if (f->io_size > (file_size - f->file_offset)) + f->io_size = file_size - f->file_offset; + f->io_size -= (f->io_size % td_min_bs(td)); } total_size += f->io_size; @@ -907,9 +1063,9 @@ if (!o->create_on_open) { need_extend++; extend_size += (f->io_size + f->file_offset); + fio_file_set_extend(f); } else f->real_file_size = f->io_size + f->file_offset; - fio_file_set_extend(f); } } @@ -943,14 +1099,21 @@ } /* - * See if we need to extend some files + * See if we need to extend some files, typically needed when our + * target regular files don't exist yet, but our jobs require them + * initially due to read I/Os. */ if (need_extend) { temp_stall_ts = 1; - if (output_format & FIO_OUTPUT_NORMAL) - log_info("%s: Laying out IO file(s) (%u file(s) /" - " %lluMB)\n", o->name, need_extend, - extend_size >> 20); + if (output_format & FIO_OUTPUT_NORMAL) { + log_info("%s: Laying out IO file%s (%u file%s / %s%lluMiB)\n", + o->name, + need_extend > 1 ? "s" : "", + need_extend, + need_extend > 1 ? "s" : "", + need_extend > 1 ? "total " : "", + extend_size >> 20); + } for_each_file(td, f, i) { unsigned long long old_len = -1ULL, extend_len = -1ULL; @@ -997,8 +1160,8 @@ * stored entries. 
*/ if (!o->read_iolog_file) { - if (o->io_limit) - td->total_io_size = o->io_limit * o->loops; + if (o->io_size) + td->total_io_size = o->io_size * o->loops; else td->total_io_size = o->size * o->loops; } @@ -1024,10 +1187,11 @@ dprint(FD_FILE, "pre_read files\n"); for_each_file(td, f, i) { - pre_read_file(td, f); + if (pre_read_file(td, f)) + return -1; } - return 1; + return 0; } static int __init_rand_distribution(struct thread_data *td, struct fio_file *f) @@ -1229,12 +1393,12 @@ /* \\.\ is the device namespace in Windows, where every file is * a block device */ if (strncmp(f->file_name, "\\\\.\\", 4) == 0) - f->filetype = FIO_TYPE_BD; + f->filetype = FIO_TYPE_BLOCK; #endif if (!stat(f->file_name, &sb)) { if (S_ISBLK(sb.st_mode)) - f->filetype = FIO_TYPE_BD; + f->filetype = FIO_TYPE_BLOCK; else if (S_ISCHR(sb.st_mode)) f->filetype = FIO_TYPE_CHAR; else if (S_ISFIFO(sb.st_mode)) @@ -1679,3 +1843,32 @@ { free_already_allocated(); } + +/* + * This function is for platforms which support direct I/O but not O_DIRECT. 
+ */ +int fio_set_directio(struct thread_data *td, struct fio_file *f) +{ +#ifdef FIO_OS_DIRECTIO + int ret = fio_set_odirect(f); + + if (ret) { + td_verror(td, ret, "fio_set_directio"); +#if defined(__sun__) + if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */ + log_err("fio: doing directIO to RAW devices or ZFS not supported\n"); + } else { + log_err("fio: the file system does not seem to support direct IO\n"); + } +#else + log_err("fio: the file system does not seem to support direct IO\n"); +#endif + return -1; + } + + return 0; +#else + log_err("fio: direct IO is not supported on this host operating system\n"); + return -1; +#endif +} diff -Nru fio-2.16/fio.1 fio-3.1/fio.1 --- fio-2.16/fio.1 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/fio.1 2017-09-28 10:23:20.000000000 +0000 @@ -1,4 +1,4 @@ -.TH fio 1 "December 2014" "User Manual" +.TH fio 1 "August 2017" "User Manual" .SH NAME fio \- flexible I/O tester .SH SYNOPSIS @@ -13,217 +13,549 @@ .SH OPTIONS .TP .BI \-\-debug \fR=\fPtype -Enable verbose tracing of various fio actions. May be `all' for all types -or individual types separated by a comma (eg \-\-debug=io,file). `help' will -list all available tracing options. +Enable verbose tracing \fItype\fR of various fio actions. May be `all' for all \fItype\fRs +or individual types separated by a comma (e.g. `\-\-debug=file,mem' will enable +file and memory debugging). `help' will list all available tracing options. +.TP +.BI \-\-parse\-only +Parse options only, don't start any I/O. .TP .BI \-\-output \fR=\fPfilename Write output to \fIfilename\fR. .TP -.BI \-\-output-format \fR=\fPformat -Set the reporting format to \fInormal\fR, \fIterse\fR, \fIjson\fR, or -\fIjson+\fR. Multiple formats can be selected, separate by a comma. \fIterse\fR -is a CSV based format. \fIjson+\fR is like \fIjson\fR, except it adds a full +.BI \-\-output\-format \fR=\fPformat +Set the reporting \fIformat\fR to `normal', `terse', `json', or +`json+'. 
Multiple formats can be selected, separate by a comma. `terse' +is a CSV based format. `json+' is like `json', except it adds a full dump of the latency buckets. .TP -.BI \-\-runtime \fR=\fPruntime -Limit run time to \fIruntime\fR seconds. -.TP -.B \-\-bandwidth\-log +.BI \-\-bandwidth\-log Generate aggregate bandwidth logs. .TP -.B \-\-minimal -Print statistics in a terse, semicolon-delimited format. +.BI \-\-minimal +Print statistics in a terse, semicolon\-delimited format. .TP -.B \-\-append-terse -Print statistics in selected mode AND terse, semicolon-delimited format. -Deprecated, use \-\-output-format instead to select multiple formats. -.TP -.B \-\-version -Display version information and exit. +.BI \-\-append\-terse +Print statistics in selected mode AND terse, semicolon\-delimited format. +\fBDeprecated\fR, use \fB\-\-output\-format\fR instead to select multiple formats. .TP .BI \-\-terse\-version \fR=\fPversion -Set terse version output format (Current version 3, or older version 2). +Set terse \fIversion\fR output format (default `3', or `2', `4', `5'). +.TP +.BI \-\-version +Print version information and exit. .TP -.B \-\-help -Display usage information and exit. +.BI \-\-help +Print a summary of the command line options and exit. .TP -.B \-\-cpuclock-test -Perform test and validation of internal CPU clock +.BI \-\-cpuclock\-test +Perform test and validation of internal CPU clock. .TP -.BI \-\-crctest[\fR=\fPtest] -Test the speed of the builtin checksumming functions. If no argument is given, -all of them are tested. Or a comma separated list can be passed, in which +.BI \-\-crctest \fR=\fP[test] +Test the speed of the built\-in checksumming functions. If no argument is given, +all of them are tested. Alternatively, a comma separated list can be passed, in which case the given ones are tested. .TP .BI \-\-cmdhelp \fR=\fPcommand -Print help information for \fIcommand\fR. May be `all' for all commands. +Print help information for \fIcommand\fR. 
May be `all' for all commands. .TP -.BI \-\-enghelp \fR=\fPioengine[,command] -List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR defined by \fIioengine\fR. +.BI \-\-enghelp \fR=\fP[ioengine[,command]] +List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR +defined by \fIioengine\fR. If no \fIioengine\fR is given, list all +available ioengines. .TP .BI \-\-showcmd \fR=\fPjobfile -Convert \fIjobfile\fR to a set of command-line options. +Convert \fIjobfile\fR to a set of command\-line options. +.TP +.BI \-\-readonly +Turn on safety read\-only checks, preventing writes. The \fB\-\-readonly\fR +option is an extra safety guard to prevent users from accidentally starting +a write workload when that is not desired. Fio will only write if +`rw=write/randwrite/rw/randrw' is given. This extra safety net can be used +as an extra precaution as \fB\-\-readonly\fR will also enable a write check in +the I/O engine core to prevent writes due to unknown user space bug(s). .TP .BI \-\-eta \fR=\fPwhen -Specifies when real-time ETA estimate should be printed. \fIwhen\fR may -be one of `always', `never' or `auto'. +Specifies when real\-time ETA estimate should be printed. \fIwhen\fR may +be `always', `never' or `auto'. .TP .BI \-\-eta\-newline \fR=\fPtime -Force an ETA newline for every `time` period passed. +Force a new line for every \fItime\fR period passed. When the unit is omitted, +the value is interpreted in seconds. .TP .BI \-\-status\-interval \fR=\fPtime -Report full output status every `time` period passed. -.TP -.BI \-\-readonly -Turn on safety read-only checks, preventing any attempted write. -.TP -.BI \-\-section \fR=\fPsec -Only run section \fIsec\fR from job file. This option can be used multiple times to add more sections to run. +Force a full status dump of cumulative (from job start) values at \fItime\fR +intervals. This option does *not* provide per-period measurements. 
So +values such as bandwidth are running averages. When the time unit is omitted, +\fItime\fR is interpreted in seconds. +.TP +.BI \-\-section \fR=\fPname +Only run specified section \fIname\fR in job file. Multiple sections can be specified. +The \fB\-\-section\fR option allows one to combine related jobs into one file. +E.g. one job file could define light, moderate, and heavy sections. Tell +fio to run only the "heavy" section by giving `\-\-section=heavy' +command line option. One can also specify the "write" operations in one +section and "verify" operation in another section. The \fB\-\-section\fR option +only applies to job sections. The reserved *global* section is always +parsed and used. .TP .BI \-\-alloc\-size \fR=\fPkb -Set the internal smalloc pool size to \fIkb\fP kilobytes. +Set the internal smalloc pool size to \fIkb\fR in KiB. The +\fB\-\-alloc\-size\fR switch allows one to use a larger pool size for smalloc. +If running large jobs with randommap enabled, fio can run out of memory. +Smalloc is an internal allocator for shared structures from a fixed size +memory pool and can grow to 16 pools. The pool size defaults to 16MiB. +NOTE: While running `.fio_smalloc.*' backing store files are visible +in `/tmp'. .TP .BI \-\-warnings\-fatal All fio parser warnings are fatal, causing fio to exit with an error. .TP .BI \-\-max\-jobs \fR=\fPnr -Set the maximum allowed number of jobs (threads/processes) to support. +Set the maximum number of threads/processes to support to \fInr\fR. .TP .BI \-\-server \fR=\fPargs -Start a backend server, with \fIargs\fP specifying what to listen to. See client/server section. +Start a backend server, with \fIargs\fR specifying what to listen to. +See \fBCLIENT/SERVER\fR section. .TP .BI \-\-daemonize \fR=\fPpidfile -Background a fio server, writing the pid to the given pid file. +Background a fio server, writing the pid to the given \fIpidfile\fR file. 
+.TP +.BI \-\-client \fR=\fPhostname +Instead of running the jobs locally, send and run them on the given \fIhostname\fR +or set of \fIhostname\fRs. See \fBCLIENT/SERVER\fR section. .TP -.BI \-\-client \fR=\fPhost -Instead of running the jobs locally, send and run them on the given host or set of hosts. See client/server section. +.BI \-\-remote\-config \fR=\fPfile +Tell fio server to load this local \fIfile\fR. .TP .BI \-\-idle\-prof \fR=\fPoption -Report cpu idleness on a system or percpu basis (\fIoption\fP=system,percpu) or run unit work calibration only (\fIoption\fP=calibrate). -.SH "JOB FILE FORMAT" -Job files are in `ini' format. They consist of one or more -job definitions, which begin with a job name in square brackets and -extend to the next job name. The job name can be any ASCII string -except `global', which has a special meaning. Following the job name is -a sequence of zero or more parameters, one per line, that define the -behavior of the job. Any line starting with a `;' or `#' character is -considered a comment and ignored. -.P -If \fIjobfile\fR is specified as `-', the job file will be read from -standard input. -.SS "Global Section" -The global section contains default parameters for jobs specified in the -job file. A job is only affected by global sections residing above it, -and there may be any number of global sections. Specific job definitions -may override any parameter set in global sections. -.SH "JOB PARAMETERS" -.SS Types -Some parameters may take arguments of a specific type. -Anywhere a numeric value is required, an arithmetic expression may be used, -provided it is surrounded by parentheses. Supported operators are: +Report CPU idleness. \fIoption\fR is one of the following: .RS .RS .TP -.B addition (+) +.B calibrate +Run unit work calibration only and exit. .TP -.B subtraction (-) +.B system +Show aggregate system idleness and unit work. .TP -.B multiplication (*) +.B percpu +As \fBsystem\fR but also show per CPU idleness. 
+.RE +.RE .TP -.B division (/) +.BI \-\-inflate\-log \fR=\fPlog +Inflate and output compressed \fIlog\fR. .TP -.B modulus (%) +.BI \-\-trigger\-file \fR=\fPfile +Execute trigger command when \fIfile\fR exists. +.TP +.BI \-\-trigger\-timeout \fR=\fPtime +Execute trigger at this \fItime\fR. +.TP +.BI \-\-trigger \fR=\fPcommand +Set this \fIcommand\fR as local trigger. .TP +.BI \-\-trigger\-remote \fR=\fPcommand +Set this \fIcommand\fR as remote trigger. +.TP +.BI \-\-aux\-path \fR=\fPpath +Use this \fIpath\fR for fio state generated files. +.SH "JOB FILE FORMAT" +Any parameters following the options will be assumed to be job files, unless +they match a job file parameter. Multiple job files can be listed and each job +file will be regarded as a separate group. Fio will \fBstonewall\fR execution +between each group. + +Fio accepts one or more job files describing what it is +supposed to do. The job file format is the classic ini file, where the names +enclosed in [] brackets define the job name. You are free to use any ASCII name +you want, except *global* which has special meaning. Following the job name is +a sequence of zero or more parameters, one per line, that define the behavior of +the job. If the first character in a line is a ';' or a '#', the entire line is +discarded as a comment. + +A *global* section sets defaults for the jobs described in that file. A job may +override a *global* section parameter, and a job file may even have several +*global* sections if so desired. A job is only affected by a *global* section +residing above it. + +The \fB\-\-cmdhelp\fR option also lists all options. If used with an \fIcommand\fR +argument, \fB\-\-cmdhelp\fR will detail the given \fIcommand\fR. + +See the `examples/' directory for inspiration on how to write job files. Note +the copyright and license requirements currently apply to +`examples/' files. +.SH "JOB FILE PARAMETERS" +Some parameters take an option of a given type, such as an integer or a +string. 
Anywhere a numeric value is required, an arithmetic expression may be +used, provided it is surrounded by parentheses. Supported operators are: +.RS +.P +.B addition (+) +.P +.B subtraction (\-) +.P +.B multiplication (*) +.P +.B division (/) +.P +.B modulus (%) +.P .B exponentiation (^) .RE -.RE .P For time values in expressions, units are microseconds by default. This is different than for time values not in expressions (not enclosed in -parentheses). The types used are: +parentheses). +.SH "PARAMETER TYPES" +The following parameter types are used. .TP .I str -String: a sequence of alphanumeric characters. +String. A sequence of alphanumeric characters. +.TP +.I time +Integer with possible time suffix. Without a unit value is interpreted as +seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h' for +hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and 'us' +(or 'usec') for microseconds. For example, use 10m for 10 minutes. .TP .I int -SI integer: a whole number, possibly containing a suffix denoting the base unit -of the value. Accepted suffixes are `k', 'M', 'G', 'T', and 'P', denoting -kilo (1024), mega (1024^2), giga (1024^3), tera (1024^4), and peta (1024^5) -respectively. If prefixed with '0x', the value is assumed to be base 16 -(hexadecimal). A suffix may include a trailing 'b', for instance 'kb' is -identical to 'k'. You can specify a base 10 value by using 'KiB', 'MiB','GiB', -etc. This is useful for disk drives where values are often given in base 10 -values. Specifying '30GiB' will get you 30*1000^3 bytes. -When specifying times the default suffix meaning changes, still denoting the -base unit of the value, but accepted suffixes are 'D' (days), 'H' (hours), 'M' -(minutes), 'S' Seconds, 'ms' (or msec) milli seconds, 'us' (or 'usec') micro -seconds. Time values without a unit specify seconds. -The suffixes are not case sensitive. +Integer. 
A whole number value, which may contain an integer prefix +and an integer suffix. +.RS +.RS +.P +[*integer prefix*] **number** [*integer suffix*] +.RE +.P +The optional *integer prefix* specifies the number's base. The default +is decimal. *0x* specifies hexadecimal. +.P +The optional *integer suffix* specifies the number's units, and includes an +optional unit prefix and an optional unit. For quantities of data, the +default unit is bytes. For quantities of time, the default unit is seconds +unless otherwise specified. +.P +With `kb_base=1000', fio follows international standards for unit +prefixes. To specify power\-of\-10 decimal values defined in the +International System of Units (SI): +.RS +.P +.PD 0 +K means kilo (K) or 1000 +.P +M means mega (M) or 1000**2 +.P +G means giga (G) or 1000**3 +.P +T means tera (T) or 1000**4 +.P +P means peta (P) or 1000**5 +.PD +.RE +.P +To specify power\-of\-2 binary values defined in IEC 80000\-13: +.RS +.P +.PD 0 +Ki means kibi (Ki) or 1024 +.P +Mi means mebi (Mi) or 1024**2 +.P +Gi means gibi (Gi) or 1024**3 +.P +Ti means tebi (Ti) or 1024**4 +.P +Pi means pebi (Pi) or 1024**5 +.PD +.RE +.P +With `kb_base=1024' (the default), the unit prefixes are opposite +from those specified in the SI and IEC 80000\-13 standards to provide +compatibility with old scripts. For example, 4k means 4096. +.P +For quantities of data, an optional unit of 'B' may be included +(e.g., 'kB' is the same as 'k'). +.P +The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega, +not milli). 'b' and 'B' both mean byte, not bit. 
+.P +Examples with `kb_base=1000': +.RS +.P +.PD 0 +4 KiB: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB +.P +1 MiB: 1048576, 1mi, 1024ki +.P +1 MB: 1000000, 1m, 1000k +.P +1 TiB: 1099511627776, 1ti, 1024gi, 1048576mi +.P +1 TB: 1000000000000, 1t, 1000g, 1000000m +.PD +.RE +.P +Examples with `kb_base=1024' (default): +.RS +.P +.PD 0 +4 KiB: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB +.P +1 MiB: 1048576, 1m, 1024k +.P +1 MB: 1000000, 1mi, 1000ki +.P +1 TiB: 1099511627776, 1t, 1024g, 1048576m +.P +1 TB: 1000000000000, 1ti, 1000gi, 1000000mi +.PD +.RE +.P +To specify times (units are not case sensitive): +.RS +.P +.PD 0 +D means days +.P +H means hours +.P +M means minutes +.P +s or sec means seconds (default) +.P +ms or msec means milliseconds +.P +us or usec means microseconds +.PD +.RE +.P +If the option accepts an upper and lower range, use a colon ':' or +minus '\-' to separate such values. See \fIirange\fR parameter type. +If the lower value specified happens to be larger than the upper value +the two values are swapped. +.RE .TP .I bool -Boolean: a true or false value. `0' denotes false, `1' denotes true. +Boolean. Usually parsed as an integer, however only defined for +true and false (1 and 0). .TP .I irange -Integer range: a range of integers specified in the format -\fIlower\fR:\fIupper\fR or \fIlower\fR\-\fIupper\fR. \fIlower\fR and -\fIupper\fR may contain a suffix as described above. If an option allows two -sets of ranges, they are separated with a `,' or `/' character. For example: -`8\-8k/8M\-4G'. +Integer range with suffix. Allows value range to be given, such as +1024\-4096. A colon may also be used as the separator, e.g. 1k:4k. If the +option allows two sets of ranges, they can be specified with a ',' or '/' +delimiter: 1k\-4k/8k\-32k. Also see \fIint\fR parameter type. .TP .I float_list -List of floating numbers: A list of floating numbers, separated by -a ':' character. -.SS "Parameter List" +A list of floating point numbers, separated by a ':' character.
+.SH "JOB PARAMETERS" +With the above in mind, here follows the complete list of fio job parameters. +.SS "Units" .TP -.BI name \fR=\fPstr -May be used to override the job name. On the command line, this parameter -has the special purpose of signalling the start of a new job. +.BI kb_base \fR=\fPint +Select the interpretation of unit prefixes in input parameters. +.RS +.RS .TP -.BI wait_for \fR=\fPstr -Specifies the name of the already defined job to wait for. Single waitee name -only may be specified. If set, the job won't be started until all workers of -the waitee job are done. Wait_for operates on the job name basis, so there are -a few limitations. First, the waitee must be defined prior to the waiter job -(meaning no forward references). Second, if a job is being referenced as a -waitee, it must have a unique name (no duplicate waitees). +.B 1000 +Inputs comply with IEC 80000\-13 and the International +System of Units (SI). Use: +.RS +.P +.PD 0 +\- power\-of\-2 values with IEC prefixes (e.g., KiB) +.P +\- power\-of\-10 values with SI prefixes (e.g., kB) +.PD +.RE +.TP +.B 1024 +Compatibility mode (default). To avoid breaking old scripts: +.P +.RS +.PD 0 +\- power\-of\-2 values with SI prefixes +.P +\- power\-of\-10 values with IEC prefixes +.PD +.RE +.RE +.P +See \fBbs\fR for more details on input parameters. +.P +Outputs always use correct prefixes. Most outputs include both +side\-by\-side, like: +.P +.RS +bw=2383.3kB/s (2327.4KiB/s) +.RE +.P +If only one value is reported, then kb_base selects the one to use: +.P +.RS +.PD 0 +1000 \-\- SI prefixes +.P +1024 \-\- IEC prefixes +.PD +.RE +.RE +.TP +.BI unit_base \fR=\fPint +Base unit for reporting. Allowed values are: +.RS +.RS +.TP +.B 0 +Use auto\-detection (default). +.TP +.B 8 +Byte based. +.TP +.B 1 +Bit based. +.RE +.RE +.SS "Job description" +.TP +.BI name \fR=\fPstr +ASCII name of the job. This may be used to override the name printed by fio +for this job. Otherwise the job name is used. 
On the command line this +parameter has the special purpose of also signaling the start of a new job. .TP .BI description \fR=\fPstr -Human-readable description of the job. It is printed when the job is run, but -otherwise has no special purpose. +Text description of the job. Doesn't do anything except dump this text +description when this job is run. It's not parsed. +.TP +.BI loops \fR=\fPint +Run the specified number of iterations of this job. Used to repeat the same +workload a given number of times. Defaults to 1. +.TP +.BI numjobs \fR=\fPint +Create the specified number of clones of this job. Each clone of the job +is spawned as an independent thread or process. May be used to set up a +larger number of threads/processes doing the same thing. Each thread is +reported separately; to see statistics for all clones as a whole, use +\fBgroup_reporting\fR in conjunction with \fBnew_group\fR. +See \fB\-\-max\-jobs\fR. Default: 1. +.SS "Time related parameters" +.TP +.BI runtime \fR=\fPtime +Tell fio to terminate processing after the specified period of time. It +can be quite hard to determine for how long a specified job will run, so +this parameter is handy to cap the total runtime to a given time. When +the unit is omitted, the value is interpreted in seconds. +.TP +.BI time_based +If set, fio will run for the duration of the \fBruntime\fR specified +even if the file(s) are completely read or written. It will simply loop over +the same workload as many times as the \fBruntime\fR allows. +.TP +.BI startdelay \fR=\fPirange(int) +Delay the start of the job for the specified amount of time. Can be a single +value or a range. When given as a range, each thread will choose a value +randomly from within the range. Value is in seconds if a unit is omitted. +.TP +.BI ramp_time \fR=\fPtime +If set, fio will run the specified workload for this amount of time before +logging any performance numbers.
Useful for letting performance settle +before logging results, thus minimizing the runtime required for stable +results. Note that the \fBramp_time\fR is considered lead in time for a job, +thus it will increase the total runtime if a special timeout or +\fBruntime\fR is specified. When the unit is omitted, the value is +given in seconds. +.TP +.BI clocksource \fR=\fPstr +Use the given clocksource as the base of timing. The supported options are: +.RS +.RS +.TP +.B gettimeofday +\fBgettimeofday\fR\|(2) +.TP +.B clock_gettime +\fBclock_gettime\fR\|(2) +.TP +.B cpu +Internal CPU clock source +.RE +.P +\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast (and +fio is heavy on time calls). Fio will automatically use this clocksource if +it's supported and considered reliable on the system it is running on, +unless another clocksource is specifically set. For x86/x86\-64 CPUs, this +means supporting TSC Invariant. +.RE +.TP +.BI gtod_reduce \fR=\fPbool +Enable all of the \fBgettimeofday\fR\|(2) reducing options +(\fBdisable_clat\fR, \fBdisable_slat\fR, \fBdisable_bw_measurement\fR) plus +reduce precision of the timeout somewhat to really shrink the +\fBgettimeofday\fR\|(2) call count. With this option enabled, we only do +about 0.4% of the \fBgettimeofday\fR\|(2) calls we would have done if all +time keeping was enabled. +.TP +.BI gtod_cpu \fR=\fPint +Sometimes it's cheaper to dedicate a single thread of execution to just +getting the current time. Fio (and databases, for instance) are very +intensive on \fBgettimeofday\fR\|(2) calls. With this option, you can set +one CPU aside for doing nothing but logging current time to a shared memory +location. Then the other threads/processes that run I/O workloads need only +copy that segment, instead of entering the kernel with a +\fBgettimeofday\fR\|(2) call. The CPU set aside for doing these time +calls will be excluded from other uses. Fio will manually clear it from the +CPU mask of other jobs. 
+.SS "Target file/device" .TP .BI directory \fR=\fPstr -Prefix filenames with this directory. Used to place files in a location other -than `./'. -You can specify a number of directories by separating the names with a ':' -character. These directories will be assigned equally distributed to job clones -creates with \fInumjobs\fR as long as they are using generated filenames. -If specific \fIfilename(s)\fR are set fio will use the first listed directory, -and thereby matching the \fIfilename\fR semantic which generates a file each -clone if not specified, but let all clones use the same if set. See -\fIfilename\fR for considerations regarding escaping certain characters on -some platforms. +Prefix \fBfilename\fRs with this directory. Used to place files in a different +location than `./'. You can specify a number of directories by +separating the names with a ':' character. These directories will be +distributed equally among the job clones created by \fBnumjobs\fR as +long as they are using generated filenames. If specific \fBfilename\fR(s) are +set, fio will use the first listed directory, thereby matching the +\fBfilename\fR semantics, which generate a file for each clone if not specified but +let all clones use the same file if set. +.RS +.P +See the \fBfilename\fR option for information on how to escape ':' and '\\' +characters within the directory path itself. +.RE .TP .BI filename \fR=\fPstr -.B fio -normally makes up a file name based on the job name, thread number, and file -number. If you want to share files between threads in a job or several jobs, -specify a \fIfilename\fR for each of them to override the default. -If the I/O engine is file-based, you can specify -a number of files by separating the names with a `:' character. `\-' is a -reserved name, meaning stdin or stdout, depending on the read/write direction -set. On Windows, disk devices are accessed as \\.\PhysicalDrive0 for the first -device, \\.\PhysicalDrive1 for the second etc.
Note: Windows and FreeBSD -prevent write access to areas of the disk containing in-use data -(e.g. filesystems). If the wanted filename does need to include a colon, then -escape that with a '\\' character. For instance, if the filename is -"/dev/dsk/foo@3,0:c", then you would use filename="/dev/dsk/foo@3,0\\:c". +Fio normally makes up a \fBfilename\fR based on the job name, thread number, and +file number (see \fBfilename_format\fR). If you want to share files +between threads in a job or several +jobs with fixed file paths, specify a \fBfilename\fR for each of them to override +the default. If the ioengine is file based, you can specify a number of files +by separating the names with a ':' colon. So if you wanted a job to open +`/dev/sda' and `/dev/sdb' as the two working files, you would use +`filename=/dev/sda:/dev/sdb'. This also means that whenever this option is +specified, \fBnrfiles\fR is ignored. The size of regular files specified +by this option will be \fBsize\fR divided by the number of files unless an +explicit size is specified by \fBfilesize\fR. +.RS +.P +Each colon and backslash in the wanted path must be escaped with a '\\' +character. For instance, if the path is `/dev/dsk/foo@3,0:c' then you +would use `filename=/dev/dsk/foo@3,0\\:c' and if the path is +`F:\\\\filename' then you would use `filename=F\\:\\\\filename'. +.P +On Windows, disk devices are accessed as `\\\\\\\\.\\\\PhysicalDrive0' for +the first device, `\\\\\\\\.\\\\PhysicalDrive1' for the second etc. +Note: Windows and FreeBSD prevent write access to areas +of the disk containing in\-use data (e.g. filesystems). +.P +The filename `\-' is a reserved name, meaning *stdin* or *stdout*. Which +of the two depends on the read/write direction set. +.RE .TP .BI filename_format \fR=\fPstr -If sharing multiple files between jobs, it is usually necessary to have -fio generate the exact names that you want.
By default, fio will name a file +If sharing multiple files between jobs, it is usually necessary to have fio +generate the exact names that you want. By default, fio will name a file based on the default file format specification of -\fBjobname.jobnumber.filenumber\fP. With this option, that can be +`jobname.jobnumber.filenumber'. With this option, that can be customized. Fio will recognize and replace the following keywords in this string: .RS @@ -239,44 +571,168 @@ The incremental number of the file for that worker thread or process. .RE .P -To have dependent jobs share a set of files, this option can be set to -have fio generate filenames that are shared between the two. For instance, -if \fBtestfiles.$filenum\fR is specified, file number 4 for any job will -be named \fBtestfiles.4\fR. The default of \fB$jobname.$jobnum.$filenum\fR +To have dependent jobs share a set of files, this option can be set to have +fio generate filenames that are shared between the two. For instance, if +`testfiles.$filenum' is specified, file number 4 for any job will be +named `testfiles.4'. The default of `$jobname.$jobnum.$filenum' will be used if no other format specifier is given. .RE -.P .TP .BI unique_filename \fR=\fPbool -To avoid collisions between networked clients, fio defaults to prefixing -any generated filenames (with a directory specified) with the source of -the client connecting. To disable this behavior, set this option to 0. +To avoid collisions between networked clients, fio defaults to prefixing any +generated filenames (with a directory specified) with the source of the +client connecting. To disable this behavior, set this option to 0. +.TP +.BI opendir \fR=\fPstr +Recursively open any files below directory \fIstr\fR. .TP .BI lockfile \fR=\fPstr -Fio defaults to not locking any files before it does IO to them. If a file or -file descriptor is shared, fio can serialize IO to that file to make the end -result consistent. 
This is usual for emulating real workloads that share files. -The lock modes are: +Fio defaults to not locking any files before it does I/O to them. If a file +or file descriptor is shared, fio can serialize I/O to that file to make the +end result consistent. This is usual for emulating real workloads that share +files. The lock modes are: .RS .RS .TP .B none -No locking. This is the default. +No locking. The default. .TP .B exclusive -Only one thread or process may do IO at a time, excluding all others. +Only one thread or process may do I/O at a time, excluding all others. .TP .B readwrite -Read-write locking on the file. Many readers may access the file at the same -time, but writes get exclusive access. +Read\-write locking on the file. Many readers may +access the file at the same time, but writes get exclusive access. .RE .RE +.TP +.BI nrfiles \fR=\fPint +Number of files to use for this job. Defaults to 1. The size of files +will be \fBsize\fR divided by this unless an explicit size is specified by +\fBfilesize\fR. Files are created for each thread separately, and each +file will have a file number within its name by default, as explained in +the \fBfilename\fR section. +.TP +.BI openfiles \fR=\fPint +Number of files to keep open at the same time. Defaults to the same as +\fBnrfiles\fR; can be set smaller to limit the number of simultaneous +opens. +.TP +.BI file_service_type \fR=\fPstr +Defines how fio decides which file from a job to service next. The following +types are defined: +.RS +.RS +.TP +.B random +Choose a file at random. +.TP +.B roundrobin +Round robin over opened files. This is the default. +.TP +.B sequential +Finish one file before moving on to the next. Multiple files can +still be open depending on \fBopenfiles\fR. +.TP +.B zipf +Use a Zipf distribution to decide what file to access. +.TP +.B pareto +Use a Pareto distribution to decide what file to access. +.TP +.B normal +Use a Gaussian (normal) distribution to decide what file to access.
+.TP +.B gauss +Alias for normal. +.RE .P -.BI opendir \fR=\fPstr -Recursively open any files below directory \fIstr\fR. +For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be appended to +tell fio how many I/Os to issue before switching to a new file. For example, +specifying `file_service_type=random:8' would cause fio to issue +8 I/Os before selecting a new file at random. For the non\-uniform +distributions, a floating point postfix can be given to influence how the +distribution is skewed. See \fBrandom_distribution\fR for a description +of how that would work. +.RE +.TP +.BI ioscheduler \fR=\fPstr +Attempt to switch the device hosting the file to the specified I/O scheduler +before running. +.TP +.BI create_serialize \fR=\fPbool +If true, serialize the file creation for the jobs. This may be handy to +avoid interleaving of data files, which may greatly depend on the filesystem +used and even the number of processors in the system. Default: true. +.TP +.BI create_fsync \fR=\fPbool +\fBfsync\fR\|(2) the data file after creation. This is the default. +.TP +.BI create_on_open \fR=\fPbool +If true, don't pre\-create files but allow the job's open() to create a file +when it's time to do I/O. Default: false \-\- pre\-create all necessary files +when the job starts. +.TP +.BI create_only \fR=\fPbool +If true, fio will only run the setup phase of the job. If files need to be +laid out or updated on disk, only that will be done \-\- the actual job contents +are not executed. Default: false. +.TP +.BI allow_file_create \fR=\fPbool +If true, fio is permitted to create files as part of its workload. If this +option is false, then fio will error out if +the files it needs to use don't already exist. Default: true. +.TP +.BI allow_mounted_write \fR=\fPbool +If this isn't set, fio will abort jobs that are destructive (e.g. that write) +to what appears to be a mounted device or partition. 
This should help catch +creating inadvertently destructive tests, not realizing that the test will +destroy data on the mounted file system. Note that some platforms don't allow +writing against a mounted device regardless of this option. Default: false. +.TP +.BI pre_read \fR=\fPbool +If this is given, files will be pre\-read into memory before starting the +given I/O operation. This will also clear the \fBinvalidate\fR flag, +since it is pointless to pre\-read and then drop the cache. This will only +work for I/O engines that are seek\-able, since they allow you to read the +same data multiple times. Thus it will not work on non\-seekable I/O engines +(e.g. network, splice). Default: false. +.TP +.BI unlink \fR=\fPbool +Unlink the job files when done. Not the default, as repeated runs of that +job would then waste time recreating the file set again and again. Default: +false. +.TP +.BI unlink_each_loop \fR=\fPbool +Unlink job files after each iteration or loop. Default: false. +.TP +.BI zonesize \fR=\fPint +Divide a file into zones of the specified size. See \fBzoneskip\fR. +.TP +.BI zonerange \fR=\fPint +Give size of an I/O zone. See \fBzoneskip\fR. +.TP +.BI zoneskip \fR=\fPint +Skip the specified number of bytes when \fBzonesize\fR data has been +read. The two zone options can be used to only do I/O on zones of a file. +.SS "I/O type" +.TP +.BI direct \fR=\fPbool +If value is true, use non\-buffered I/O. This is usually O_DIRECT. Note that +OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous +ioengines don't support direct I/O. Default: false. +.TP +.BI atomic \fR=\fPbool +If value is true, attempt to use atomic direct I/O. Atomic writes are +guaranteed to be stable once acknowledged by the operating system. Only +Linux supports O_ATOMIC right now. +.TP +.BI buffered \fR=\fPbool +If value is true, use buffered I/O. This is the opposite of the +\fBdirect\fR option. Defaults to true. 
.TP .BI readwrite \fR=\fPstr "\fR,\fP rw" \fR=\fPstr -Type of I/O pattern. Accepted values are: +Type of I/O pattern. Accepted values are: .RS .RS .TP @@ -287,7 +743,7 @@ Sequential writes. .TP .B trim -Sequential trim (Linux block devices only). +Sequential trims (Linux block devices only). .TP .B randread Random reads. @@ -296,73 +752,69 @@ Random writes. .TP .B randtrim -Random trim (Linux block devices only). +Random trims (Linux block devices only). .TP -.B rw, readwrite -Mixed sequential reads and writes. +.B rw,readwrite +Sequential mixed reads and writes. .TP .B randrw -Mixed random reads and writes. +Random mixed reads and writes. .TP .B trimwrite -Trim and write mixed workload. Blocks will be trimmed first, then the same -blocks will be written to. +Sequential trim+write sequences. Blocks will be trimmed first, +then the same blocks will be written to. .RE .P -Fio defaults to read if the option is not specified. -For mixed I/O, the default split is 50/50. For certain types of io the result -may still be skewed a bit, since the speed may be different. It is possible to -specify a number of IO's to do before getting a new offset, this is done by -appending a `:\fI<nr>\fR to the end of the string given. For a random read, it -would look like \fBrw=randread:8\fR for passing in an offset modifier with a -value of 8. If the postfix is used with a sequential IO pattern, then the value -specified will be added to the generated offset for each IO. For instance, -using \fBrw=write:4k\fR will skip 4k for every write. It turns sequential IO -into sequential IO with holes. See the \fBrw_sequencer\fR option. +Fio defaults to read if the option is not specified. For the mixed I/O +types, the default is to split them 50/50. For certain types of I/O the +result may still be skewed a bit, since the speed may be different. +.P +It is possible to specify the number of I/Os to do before getting a new +offset by appending `:<nr>' to the end of the string given.
For a +random read, it would look like `rw=randread:8' for passing in an offset +modifier with a value of 8. If the suffix is used with a sequential I/O +pattern, then the `<nr>' value specified will be added to the generated +offset for each I/O, turning sequential I/O into sequential I/O with holes. +For instance, using `rw=write:4k' will skip 4k for every write. Also see +the \fBrw_sequencer\fR option. .RE .TP .BI rw_sequencer \fR=\fPstr -If an offset modifier is given by appending a number to the \fBrw=\fR line, -then this option controls how that number modifies the IO offset being -generated. Accepted values are: +If an offset modifier is given by appending a number to the `rw=\fIstr\fR' +line, then this option controls how that number modifies the I/O offset +being generated. Accepted values are: .RS .RS .TP .B sequential -Generate sequential offset +Generate sequential offset. .TP .B identical -Generate the same offset +Generate the same offset. .RE .P -\fBsequential\fR is only useful for random IO, where fio would normally -generate a new random offset for every IO. If you append eg 8 to randread, you -would get a new random offset for every 8 IO's. The result would be a seek for -only every 8 IO's, instead of for every IO. Use \fBrw=randread:8\fR to specify -that. As sequential IO is already sequential, setting \fBsequential\fR for that -would not result in any differences. \fBidentical\fR behaves in a similar -fashion, except it sends the same offset 8 number of times before generating a -new offset. +\fBsequential\fR is only useful for random I/O, where fio would normally +generate a new random offset for every I/O. If you append e.g. 8 to randread, +you would get a new random offset for every 8 I/Os. The result would be a +seek for only every 8 I/Os, instead of for every I/O. Use `rw=randread:8' +to specify that. As sequential I/O is already sequential, setting +\fBsequential\fR for that would not result in any differences.
\fBidentical\fR +behaves in a similar fashion, except it sends the same offset 8 +times before generating a new offset. .RE -.P -.TP -.BI kb_base \fR=\fPint -The base unit for a kilobyte. The defacto base is 2^10, 1024. Storage -manufacturers like to use 10^3 or 1000 as a base ten unit instead, for obvious -reasons. Allowed values are 1024 or 1000, with 1024 being the default. .TP .BI unified_rw_reporting \fR=\fPbool Fio normally reports statistics on a per data direction basis, meaning that -read, write, and trim are accounted and reported separately. If this option is -set fio sums the results and reports them as "mixed" instead. +reads, writes, and trims are accounted and reported separately. If this +option is set, fio sums the results and reports them as "mixed" instead. .TP .BI randrepeat \fR=\fPbool -Seed the random number generator used for random I/O patterns in a predictable -way so the pattern is repeatable across runs. Default: true. +Seed the random number generator used for random I/O patterns in a +predictable way so the pattern is repeatable across runs. Default: true. .TP .BI allrandrepeat \fR=\fPbool Seed all random number generators in a predictable way so results are -repeatable across runs. Default: false. +repeatable across runs. Default: false. .TP .BI randseed \fR=\fPint Seed the random number generators based on this seed value, to be able to @@ -370,30 +822,36 @@ sequence depends on the \fBrandrepeat\fR setting. .TP .BI fallocate \fR=\fPstr -Whether pre-allocation is performed when laying down files. Accepted values -are: +Whether pre\-allocation is performed when laying down files. +Accepted values are: .RS .RS .TP .B none -Do not pre-allocate space. +Do not pre\-allocate space. +.TP +.B native +Use a platform's native pre\-allocation call but fall back to +\fBnone\fR behavior if it fails/is not implemented. .TP .B posix -Pre-allocate via \fBposix_fallocate\fR\|(3). +Pre\-allocate via \fBposix_fallocate\fR\|(3).
.TP .B keep -Pre-allocate via \fBfallocate\fR\|(2) with FALLOC_FL_KEEP_SIZE set. +Pre\-allocate via \fBfallocate\fR\|(2) with +FALLOC_FL_KEEP_SIZE set. .TP .B 0 -Backward-compatible alias for 'none'. +Backward\-compatible alias for \fBnone\fR. .TP .B 1 -Backward-compatible alias for 'posix'. +Backward\-compatible alias for \fBposix\fR. .RE .P -May not be available on all supported platforms. 'keep' is only -available on Linux. If using ZFS on Solaris this must be set to 'none' -because ZFS doesn't support it. Default: 'posix'. +May not be available on all supported platforms. \fBkeep\fR is only available +on Linux. If using ZFS on Solaris this cannot be set to \fBposix\fR +because ZFS doesn't support pre\-allocation. Default: \fBnative\fR if any +pre\-allocation methods are available, \fBnone\fR if not. .RE .TP .BI fadvise_hint \fR=\fPstr @@ -407,225 +865,569 @@ .TP .B 1 Backwards compatible hint for "advise with fio workload type". This -uses \fBFADV_RANDOM\fR for a random workload, and \fBFADV_SEQUENTIAL\fR +uses FADV_RANDOM for a random workload, and FADV_SEQUENTIAL for a sequential workload. .TP .B sequential -Advise using \fBFADV_SEQUENTIAL\fR +Advise using FADV_SEQUENTIAL. .TP .B random -Advise using \fBFADV_RANDOM\fR +Advise using FADV_RANDOM. .RE .RE .TP -.BI fadvise_stream \fR=\fPint -Use \fBposix_fadvise\fR\|(2) to advise the kernel what stream ID the -writes issued belong to. Only supported on Linux. Note, this option -may change going forward. +.BI write_hint \fR=\fPstr +Use \fBfcntl\fR\|(2) to advise the kernel what life time to expect +from a write. Only supported on Linux, as of version 4.13. Accepted +values are: +.RS +.RS .TP -.BI size \fR=\fPint -Total size of I/O for this job. \fBfio\fR will run until this many bytes have -been transferred, unless limited by other options (\fBruntime\fR, for instance, -or increased/descreased by \fBio_size\fR). 
Unless \fBnrfiles\fR and -\fBfilesize\fR options are given, this amount will be divided between the -available files for the job. If not set, fio will use the full size of the -given files or devices. If the files do not exist, size must be given. It is -also possible to give size as a percentage between 1 and 100. If size=20% is -given, fio will use 20% of the full size of the given files or devices. +.B none +No particular life time associated with this file. .TP -.BI io_size \fR=\fPint "\fR,\fB io_limit \fR=\fPint -Normally fio operates within the region set by \fBsize\fR, which means that -the \fBsize\fR option sets both the region and size of IO to be performed. -Sometimes that is not what you want. With this option, it is possible to -define just the amount of IO that fio should do. For instance, if \fBsize\fR -is set to 20G and \fBio_limit\fR is set to 5G, fio will perform IO within -the first 20G but exit when 5G have been done. The opposite is also -possible - if \fBsize\fR is set to 20G, and \fBio_size\fR is set to 40G, then -fio will do 40G of IO within the 0..20G region. +.B short +Data written to this file has a short life time. .TP -.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool -Sets size to something really large and waits for ENOSPC (no space left on -device) as the terminating condition. Only makes sense with sequential write. -For a read workload, the mount point will be filled first then IO started on -the result. This option doesn't make sense if operating on a raw device node, -since the size of that is already known by the file system. Additionally, -writing beyond end-of-device will not return ENOSPC there. -.TP -.BI filesize \fR=\fPirange -Individual file sizes. May be a range, in which case \fBfio\fR will select sizes -for files at random within the given range, limited to \fBsize\fR in total (if -that is given). If \fBfilesize\fR is not specified, each created file is the -same size. 
+.B medium +Data written to this file has a medium life time. .TP -.BI file_append \fR=\fPbool -Perform IO after the end of the file. Normally fio will operate within the -size of a file. If this option is set, then fio will append to the file -instead. This has identical behavior to setting \fRoffset\fP to the size -of a file. This option is ignored on non-regular files. +.B long +Data written to this file has a long life time. .TP -.BI blocksize \fR=\fPint[,int] "\fR,\fB bs" \fR=\fPint[,int] -Block size for I/O units. Default: 4k. Values for reads, writes, and trims -can be specified separately in the format \fIread\fR,\fIwrite\fR,\fItrim\fR -either of which may be empty to leave that value at its default. If a trailing -comma isn't given, the remainder will inherit the last value set. -.TP -.BI blocksize_range \fR=\fPirange[,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange] -Specify a range of I/O block sizes. The issued I/O unit will always be a -multiple of the minimum size, unless \fBblocksize_unaligned\fR is set. Applies -to both reads and writes if only one range is given, but can be specified -separately with a comma separating the values. Example: bsrange=1k-4k,2k-8k. -Also (see \fBblocksize\fR). -.TP -.BI bssplit \fR=\fPstr -This option allows even finer grained control of the block sizes issued, -not just even splits between them. With this option, you can weight various -block sizes for exact control of the issued IO for a job that has mixed -block sizes. The format of the option is bssplit=blocksize/percentage, -optionally adding as many definitions as needed separated by a colon. -Example: bssplit=4k/10:64k/50:32k/40 would issue 50% 64k blocks, 10% 4k -blocks and 40% 32k blocks. \fBbssplit\fR also supports giving separate -splits to reads and writes. The format is identical to what the -\fBbs\fR option accepts, the read and write parts are separated with a -comma. 
-.TP -.B blocksize_unaligned\fR,\fP bs_unaligned -If set, any size in \fBblocksize_range\fR may be used. This typically won't -work with direct I/O, as that normally requires sector alignment. -.TP -.BI blockalign \fR=\fPint[,int] "\fR,\fB ba" \fR=\fPint[,int] -At what boundary to align random IO offsets. Defaults to the same as 'blocksize' -the minimum blocksize given. Minimum alignment is typically 512b -for using direct IO, though it usually depends on the hardware block size. -This option is mutually exclusive with using a random map for files, so it -will turn off that option. +.B extreme +Data written to this file has a very long life time. +.RE +.P +The values are all relative to each other, and no absolute meaning +should be associated with them. +.RE +.TP +.BI offset \fR=\fPint +Start I/O at the provided offset in the file, given as either a fixed size in +bytes or a percentage. If a percentage is given, the next \fBblockalign\fR\-ed +offset will be used. Data before the given offset will not be touched. This +effectively caps the file size at `real_size \- offset'. Can be combined with +\fBsize\fR to constrain the start and end range of the I/O workload. +A percentage can be specified by a number between 1 and 100 followed by '%', +for example, `offset=20%' to specify 20%. +.TP +.BI offset_increment \fR=\fPint +If this is provided, then the real offset becomes `\fBoffset\fR + \fBoffset_increment\fR +* thread_number', where the thread number is a counter that starts at 0 and +is incremented for each sub\-job (i.e. when \fBnumjobs\fR option is +specified). This option is useful if there are several jobs which are +intended to operate on a file in parallel disjoint segments, with even +spacing between the starting points. +.TP +.BI number_ios \fR=\fPint +Fio will normally perform I/Os until it has exhausted the size of the region +set by \fBsize\fR, or if it exhausts the allocated time (or hits an error +condition).
With this setting, the range/size can be set independently of +the number of I/Os to perform. When fio reaches this number, it will exit +normally and report status. Note that this does not extend the amount of I/O +that will be done, it will only stop fio if this condition is met before +other end\-of\-job criteria. +.TP +.BI fsync \fR=\fPint +If writing to a file, issue an \fBfsync\fR\|(2) (or its equivalent) of +the dirty data for every number of blocks given. For example, if you give 32 +as a parameter, fio will sync the file after every 32 writes issued. If fio is +using non\-buffered I/O, we may not sync the file. The exception is the sg +I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which +means fio does not periodically issue and wait for a sync to complete. Also +see \fBend_fsync\fR and \fBfsync_on_close\fR. +.TP +.BI fdatasync \fR=\fPint +Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and +not metadata blocks. In Windows, FreeBSD, and DragonFlyBSD there is no +\fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2). +Defaults to 0, which means fio does not periodically issue and wait for a +data\-only sync to complete. +.TP +.BI write_barrier \fR=\fPint +Make every N\-th write a barrier write. +.TP +.BI sync_file_range \fR=\fPstr:int +Use \fBsync_file_range\fR\|(2) for every \fIint\fR number of write +operations. Fio will track range of writes that have happened since the last +\fBsync_file_range\fR\|(2) call. \fIstr\fR can currently be one or more of: +.RS +.RS +.TP +.B wait_before +SYNC_FILE_RANGE_WAIT_BEFORE +.TP +.B write +SYNC_FILE_RANGE_WRITE +.TP +.B wait_after +SYNC_FILE_RANGE_WAIT_AFTER +.RE +.P +So if you do `sync_file_range=wait_before,write:8', fio would use +`SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE' for every 8 +writes. Also see the \fBsync_file_range\fR\|(2) man page. This option is +Linux specific.
+.RE +.TP +.BI overwrite \fR=\fPbool +If true, writes to a file will always overwrite existing data. If the file +doesn't already exist, it will be created before the write phase begins. If +the file exists and is large enough for the specified write phase, nothing +will be done. Default: false. +.TP +.BI end_fsync \fR=\fPbool +If true, \fBfsync\fR\|(2) file contents when a write stage has completed. +Default: false. +.TP +.BI fsync_on_close \fR=\fPbool +If true, fio will \fBfsync\fR\|(2) a dirty file on close. This differs +from \fBend_fsync\fR in that it will happen on every file close, not +just at the end of the job. Default: false. +.TP +.BI rwmixread \fR=\fPint +Percentage of a mixed workload that should be reads. Default: 50. +.TP +.BI rwmixwrite \fR=\fPint +Percentage of a mixed workload that should be writes. If both +\fBrwmixread\fR and \fBrwmixwrite\fR are given and the values do not +add up to 100%, the latter of the two will be used to override the +first. This may interfere with a given rate setting, if fio is asked to +limit reads or writes to a certain rate. If that is the case, then the +distribution may be skewed. Default: 50. +.TP +.BI random_distribution \fR=\fPstr:float[,str:float][,str:float] +By default, fio will use a completely uniform random distribution when asked +to perform random I/O. Sometimes it is useful to skew the distribution in +specific ways, ensuring that some parts of the data are hotter than others. +fio includes the following distribution models: +.RS +.RS +.TP +.B random +Uniform random distribution +.TP +.B zipf +Zipf distribution +.TP +.B pareto +Pareto distribution +.TP +.B normal +Normal (Gaussian) distribution +.TP +.B zoned +Zoned random distribution +.RE +.P +When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also +needed to define the access pattern. For \fBzipf\fR, this is the `Zipf theta'. +For \fBpareto\fR, it's the `Pareto power'.
Fio includes a test +program, \fBfio\-genzipf\fR, that can be used to visualize what the given input +values will yield in terms of hit rates. If you wanted to use \fBzipf\fR with +a `theta' of 1.2, you would use `random_distribution=zipf:1.2' as the +option. If a non\-uniform model is used, fio will disable use of the random +map. For the \fBnormal\fR distribution, a normal (Gaussian) deviation is +supplied as a value between 0 and 100. +.P +For a \fBzoned\fR distribution, fio supports specifying percentages of I/O +access that should fall within what range of the file or device. For +example, given criteria of: +.RS +.P +.PD 0 +60% of accesses should be to the first 10% +.P +30% of accesses should be to the next 20% +.P +8% of accesses should be to the next 30% +.P +2% of accesses should be to the next 40% +.PD +.RE +.P +we can define that through zoning of the random accesses. For the above +example, the user would do: +.RS +.P +random_distribution=zoned:60/10:30/20:8/30:2/40 +.RE +.P +similarly to how \fBbssplit\fR works for setting ranges and percentages +of block sizes. Like \fBbssplit\fR, it's possible to specify separate +zones for reads, writes, and trims. If just one set is given, it'll apply to +all of them. +.RE +.TP +.BI percentage_random \fR=\fPint[,int][,int] +For a random workload, set how big a percentage should be random. This +defaults to 100%, in which case the workload is fully random. It can be set +from anywhere from 0 to 100. Setting it to 0 would make the workload fully +sequential. Any setting in between will result in a random mix of sequential +and random I/O, at the given percentages. Comma\-separated values may be +specified for reads, writes, and trims as described in \fBblocksize\fR. +.TP +.BI norandommap +Normally fio will cover every block of the file when doing random I/O. If +this option is given, fio will just get a new random offset without looking +at past I/O history.
This means that some blocks may not be read or written, +and that some blocks may be read/written more than once. If this option is +used with \fBverify\fR and multiple blocksizes (via \fBbsrange\fR), +only intact blocks are verified, i.e., partially\-overwritten blocks are +ignored. +.TP +.BI softrandommap \fR=\fPbool +See \fBnorandommap\fR. If fio runs with the random block map enabled and +it fails to allocate the map, if this option is set it will continue without +a random block map. As coverage will not be as complete as with random maps, +this option is disabled by default. +.TP +.BI random_generator \fR=\fPstr +Fio supports the following engines for generating I/O offsets for random I/O: +.RS +.RS +.TP +.B tausworthe +Strong 2^88 cycle random number generator. +.TP +.B lfsr +Linear feedback shift register generator. +.TP +.B tausworthe64 +Strong 64\-bit 2^258 cycle random number generator. +.RE +.P +\fBtausworthe\fR is a strong random number generator, but it requires tracking +on the side if we want to ensure that blocks are only read or written +once. \fBlfsr\fR guarantees that we never generate the same offset twice, and +it's also less computationally expensive. It's not a true random generator, +however, though for I/O purposes it's typically good enough. \fBlfsr\fR only +works with single block sizes, not with workloads that use multiple block +sizes. If used with such a workload, fio may read or write some blocks +multiple times. The default value is \fBtausworthe\fR, unless the required +space exceeds 2^32 blocks. If it does, then \fBtausworthe64\fR is +selected automatically. +.RE +.SS "Block size" +.TP +.BI blocksize \fR=\fPint[,int][,int] "\fR,\fB bs" \fR=\fPint[,int][,int] +The block size in bytes used for I/O units. Default: 4096. A single value +applies to reads, writes, and trims. Comma\-separated values may be +specified for reads, writes, and trims. A value not terminated in a comma +applies to subsequent types. 
Examples: +.RS +.RS +.P +.PD 0 +bs=256k means 256k for reads, writes and trims. +.P +bs=8k,32k means 8k for reads, 32k for writes and trims. +.P +bs=8k,32k, means 8k for reads, 32k for writes, and default for trims. +.P +bs=,8k means default for reads, 8k for writes and trims. +.P +bs=,8k, means default for reads, 8k for writes, and default for trims. +.PD +.RE +.RE +.TP +.BI blocksize_range \fR=\fPirange[,irange][,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange][,irange] +A range of block sizes in bytes for I/O units. The issued I/O unit will +always be a multiple of the minimum size, unless +\fBblocksize_unaligned\fR is set. +Comma\-separated ranges may be specified for reads, writes, and trims as +described in \fBblocksize\fR. Example: +.RS +.RS +.P +bsrange=1k\-4k,2k\-8k +.RE +.RE +.TP +.BI bssplit \fR=\fPstr[,str][,str] +Sometimes you want even finer grained control of the block sizes issued, not +just an even split between them. This option allows you to weight various +block sizes, so that you are able to define a specific amount of block sizes +issued. The format for this option is: +.RS +.RS +.P +bssplit=blocksize/percentage:blocksize/percentage +.RE +.P +for as many block sizes as needed. So if you want to define a workload that +has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would write: +.RS +.P +bssplit=4k/10:64k/50:32k/40 +.RE +.P +Ordering does not matter. If the percentage is left blank, fio will fill in +the remaining values evenly. So a bssplit option like this one: +.RS +.P +bssplit=4k/50:1k/:32k/ +.RE +.P +would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always add up +to 100, if bssplit is given a range that adds up to more, it will error out. +.P +Comma\-separated values may be specified for reads, writes, and trims as +described in \fBblocksize\fR. 
+.P +If you want a workload that has 50% 2k reads and 50% 4k reads, while having +90% 4k writes and 10% 8k writes, you would specify: +.RS +.P +bssplit=2k/50:4k/50,4k/90:8k/10 +.RE +.RE +.TP +.BI blocksize_unaligned "\fR,\fB bs_unaligned" +If set, fio will issue I/O units with any size within +\fBblocksize_range\fR, not just multiples of the minimum size. This +typically won't work with direct I/O, as that normally requires sector +alignment. .TP .BI bs_is_seq_rand \fR=\fPbool -If this option is set, fio will use the normal read,write blocksize settings as -sequential,random instead. Any random read or write will use the WRITE -blocksize settings, and any sequential read or write will use the READ -blocksize setting. +If this option is set, fio will use the normal read,write blocksize settings +as sequential,random blocksize settings instead. Any random read or write +will use the WRITE blocksize settings, and any sequential read or write will +use the READ blocksize settings. +.TP +.BI blockalign \fR=\fPint[,int][,int] "\fR,\fB ba" \fR=\fPint[,int][,int] +Boundary to which fio will align random I/O units. Default: +\fBblocksize\fR. Minimum alignment is typically 512b for using direct +I/O, though it usually depends on the hardware block size. This option is +mutually exclusive with using a random map for files, so it will turn off +that option. Comma\-separated values may be specified for reads, writes, and +trims as described in \fBblocksize\fR. +.SS "Buffers and memory" .TP -.B zero_buffers +.BI zero_buffers Initialize buffers with all zeros. Default: fill buffers with random data. .TP -.B refill_buffers -If this option is given, fio will refill the IO buffers on every submit. The -default is to only fill it at init time and reuse that data. Only makes sense -if zero_buffers isn't specified, naturally. If data verification is enabled, -refill_buffers is also automatically enabled.
+.BI refill_buffers +If this option is given, fio will refill the I/O buffers on every +submit. The default is to only fill it at init time and reuse that +data. Only makes sense if zero_buffers isn't specified, naturally. If data +verification is enabled, \fBrefill_buffers\fR is also automatically enabled. .TP .BI scramble_buffers \fR=\fPbool If \fBrefill_buffers\fR is too costly and the target is using data -deduplication, then setting this option will slightly modify the IO buffer -contents to defeat normal de-dupe attempts. This is not enough to defeat -more clever block compression attempts, but it will stop naive dedupe -of blocks. Default: true. +deduplication, then setting this option will slightly modify the I/O buffer +contents to defeat normal de\-dupe attempts. This is not enough to defeat +more clever block compression attempts, but it will stop naive dedupe of +blocks. Default: true. .TP .BI buffer_compress_percentage \fR=\fPint -If this is set, then fio will attempt to provide IO buffer content (on WRITEs) -that compress to the specified level. Fio does this by providing a mix of -random data and a fixed pattern. The fixed pattern is either zeroes, or the -pattern specified by \fBbuffer_pattern\fR. If the pattern option is used, it -might skew the compression ratio slightly. Note that this is per block size -unit, for file/disk wide compression level that matches this setting. Note -that this is per block size unit, for file/disk wide compression level that -matches this setting, you'll also want to set refill_buffers. +If this is set, then fio will attempt to provide I/O buffer content (on +WRITEs) that compresses to the specified level. Fio does this by providing a +mix of random data and a fixed pattern. The fixed pattern is either zeros, +or the pattern specified by \fBbuffer_pattern\fR. If the pattern option +is used, it might skew the compression ratio slightly. 
Note that this is per +block size unit, for file/disk wide compression level that matches this +setting, you'll also want to set \fBrefill_buffers\fR. .TP .BI buffer_compress_chunk \fR=\fPint -See \fBbuffer_compress_percentage\fR. This setting allows fio to manage how -big the ranges of random data and zeroed data is. Without this set, fio will -provide \fBbuffer_compress_percentage\fR of blocksize random data, followed by -the remaining zeroed. With this set to some chunk size smaller than the block -size, fio can alternate random and zeroed data throughout the IO buffer. +See \fBbuffer_compress_percentage\fR. This setting allows fio to manage +how big the ranges of random data and zeroed data are. Without this set, fio +will provide \fBbuffer_compress_percentage\fR of blocksize random data, +followed by the remaining zeroed. With this set to some chunk size smaller +than the block size, fio can alternate random and zeroed data throughout the +I/O buffer. .TP .BI buffer_pattern \fR=\fPstr -If set, fio will fill the IO buffers with this pattern. If not set, the contents -of IO buffers is defined by the other options related to buffer contents. The -setting can be any pattern of bytes, and can be prefixed with 0x for hex -values. It may also be a string, where the string must then be wrapped with -"", e.g.: -.RS +If set, fio will fill the I/O buffers with this pattern or with the contents +of a file. If not set, the contents of I/O buffers are defined by the other +options related to buffer contents. The setting can be any pattern of bytes, +and can be prefixed with 0x for hex values. It may also be a string, where +the string must then be wrapped with "". Or it may also be a filename, +where the filename must be wrapped with '' in which case the file is +opened and read. Note that not all the file contents will be read if that +would cause the buffers to overflow.
So, for example: .RS -\fBbuffer_pattern\fR="abcd" .RS -or -.RE -\fBbuffer_pattern\fR=-12 -.RS -or -.RE -\fBbuffer_pattern\fR=0xdeadface +.P +.PD 0 +buffer_pattern='filename' +.P +or: +.P +buffer_pattern="abcd" +.P +or: +.P +buffer_pattern=\-12 +.P +or: +.P +buffer_pattern=0xdeadface +.PD .RE -.LP +.P Also you can combine everything together in any order: -.LP .RS -\fBbuffer_pattern\fR=0xdeadface"abcd"-12 +.P +buffer_pattern=0xdeadface"abcd"\-12'filename' .RE .RE .TP .BI dedupe_percentage \fR=\fPint -If set, fio will generate this percentage of identical buffers when writing. -These buffers will be naturally dedupable. The contents of the buffers depend -on what other buffer compression settings have been set. It's possible to have -the individual buffers either fully compressible, or not at all. This option -only controls the distribution of unique buffers. +If set, fio will generate this percentage of identical buffers when +writing. These buffers will be naturally dedupable. The contents of the +buffers depend on what other buffer compression settings have been set. It's +possible to have the individual buffers either fully compressible, or not at +all. This option only controls the distribution of unique buffers. .TP -.BI nrfiles \fR=\fPint -Number of files to use for this job. Default: 1. +.BI invalidate \fR=\fPbool +Invalidate the buffer/page cache parts of the files to be used prior to +starting I/O if the platform and file type support it. Defaults to true. +This will be ignored if \fBpre_read\fR is also specified for the +same job. .TP -.BI openfiles \fR=\fPint -Number of files to keep open at the same time. Default: \fBnrfiles\fR. +.BI sync \fR=\fPbool +Use synchronous I/O for buffered writes. For the majority of I/O engines, +this means using O_SYNC. Default: false. .TP -.BI file_service_type \fR=\fPstr -Defines how files to service are selected. 
The following types are defined: +.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr +Fio can use various types of memory as the I/O unit buffer. The allowed +values are: .RS .RS .TP -.B random -Choose a file at random. +.B malloc +Use memory from \fBmalloc\fR\|(3) as the buffers. Default memory type. .TP -.B roundrobin -Round robin over opened files (default). +.B shm +Use shared memory as the buffers. Allocated through \fBshmget\fR\|(2). .TP -.B sequential -Do each file in the set sequentially. +.B shmhuge +Same as \fBshm\fR, but use huge pages as backing. .TP -.B zipf -Use a zipfian distribution to decide what file to access. +.B mmap +Use \fBmmap\fR\|(2) to allocate buffers. May either be anonymous memory, or can +be file backed if a filename is given after the option. The format +is `mem=mmap:/path/to/file'. .TP -.B pareto -Use a pareto distribution to decide what file to access. +.B mmaphuge +Use a memory mapped huge file as the buffer backing. Append filename +after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file'. .TP -.B gauss -Use a gaussian (normal) distribution to decide what file to access. -.RE -.P -For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be -appended to tell fio how many I/Os to issue before switching to a new file. -For example, specifying \fBfile_service_type=random:8\fR would cause fio to -issue \fI8\fR I/Os before selecting a new file at random. For the non-uniform -distributions, a floating point postfix can be given to influence how the -distribution is skewed. See \fBrandom_distribution\fR for a description of how -that would work. +.B mmapshared +Same as \fBmmap\fR, but use a MAP_SHARED mapping. +.TP +.B cudamalloc +Use GPU memory as the buffers for GPUDirect RDMA benchmark. +The \fBioengine\fR must be \fBrdma\fR. +.RE +.P +The area allocated is a function of the maximum allowed bs size for the job, +multiplied by the I/O depth given.
Note that for \fBshmhuge\fR and +\fBmmaphuge\fR to work, the system must have free huge pages allocated. This +can normally be checked and set by reading/writing +`/proc/sys/vm/nr_hugepages' on a Linux system. Fio assumes a huge page +is 4MiB in size. So to calculate the number of huge pages you need for a +given job file, add up the I/O depth of all jobs (normally one unless +\fBiodepth\fR is used) and multiply by the maximum bs set. Then divide +that number by the huge page size. You can see the size of the huge pages in +`/proc/meminfo'. If no huge pages are allocated by having a non\-zero +number in `nr_hugepages', using \fBmmaphuge\fR or \fBshmhuge\fR will fail. Also +see \fBhugepage\-size\fR. +.P +\fBmmaphuge\fR also needs to have hugetlbfs mounted and the file location +should point there. So if it's mounted in `/huge', you would use +`mem=mmaphuge:/huge/somefile'. .RE .TP +.BI iomem_align \fR=\fPint "\fR,\fP mem_align" \fR=\fPint +This indicates the memory alignment of the I/O memory buffers. Note that +the given alignment is applied to the first I/O unit buffer, if using +\fBiodepth\fR the alignment of the following buffers is given by the +\fBbs\fR used. In other words, if using a \fBbs\fR that is a +multiple of the page size in the system, all buffers will be aligned to +this value. If using a \fBbs\fR that is not page aligned, the alignment +of subsequent I/O memory buffers is the sum of the \fBiomem_align\fR and +\fBbs\fR used. +.TP +.BI hugepage\-size \fR=\fPint +Defines the size of a huge page. Must at least be equal to the system +setting, see `/proc/meminfo'. Defaults to 4MiB. Should probably +always be a multiple of megabytes, so using `hugepage\-size=Xm' is the +preferred way to set this to avoid setting a non\-pow\-2 bad value. +.TP +.BI lockmem \fR=\fPint +Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to +simulate a smaller amount of memory. The amount specified is per worker.
+.SS "I/O size" +.TP +.BI size \fR=\fPint +The total size of file I/O for each thread of this job. Fio will run until +this many bytes have been transferred, unless runtime is limited by other options +(such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR). +Fio will divide this size between the available files determined by options +such as \fBnrfiles\fR, \fBfilename\fR, unless \fBfilesize\fR is +specified by the job. If the result of division happens to be 0, the size is +set to the physical size of the given files or devices if they exist. +If this option is not specified, fio will use the full size of the given +files or devices. If the files do not exist, size must be given. It is also +possible to give size as a percentage between 1 and 100. If `size=20%' is +given, fio will use 20% of the full size of the given files or devices. +Can be combined with \fBoffset\fR to constrain the start and end range +that I/O will be done within. +.TP +.BI io_size \fR=\fPint "\fR,\fB io_limit" \fR=\fPint +Normally fio operates within the region set by \fBsize\fR, which means +that the \fBsize\fR option sets both the region and size of I/O to be +performed. Sometimes that is not what you want. With this option, it is +possible to define just the amount of I/O that fio should do. For instance, +if \fBsize\fR is set to 20GiB and \fBio_size\fR is set to 5GiB, fio +will perform I/O within the first 20GiB but exit when 5GiB have been +done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB, +and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within +the 0..20GiB region. +.TP +.BI filesize \fR=\fPirange(int) +Individual file sizes. May be a range, in which case fio will select sizes +for files at random within the given range and limited to \fBsize\fR in +total (if that is given). If not given, each created file is the same size.
+This option overrides \fBsize\fR in terms of file size, which means +this value is used as a fixed size or possible range of each file. +.TP +.BI file_append \fR=\fPbool +Perform I/O after the end of the file. Normally fio will operate within the +size of a file. If this option is set, then fio will append to the file +instead. This has identical behavior to setting \fBoffset\fR to the size +of a file. This option is ignored on non\-regular files. +.TP +.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool +Sets size to something really large and waits for ENOSPC (no space left on +device) as the terminating condition. Only makes sense with sequential +write. For a read workload, the mount point will be filled first then I/O +started on the result. This option doesn't make sense if operating on a raw +device node, since the size of that is already known by the file system. +Additionally, writing beyond end\-of\-device will not return ENOSPC there. +.SS "I/O engine" +.TP .BI ioengine \fR=\fPstr -Defines how the job issues I/O. The following types are defined: +Defines how the job issues I/O to the file. The following types are defined: .RS .RS .TP .B sync -Basic \fBread\fR\|(2) or \fBwrite\fR\|(2) I/O. \fBfseek\fR\|(2) is used to -position the I/O location. +Basic \fBread\fR\|(2) or \fBwrite\fR\|(2) +I/O. \fBlseek\fR\|(2) is used to position the I/O location. +See \fBfsync\fR and \fBfdatasync\fR for syncing write I/Os. .TP .B psync -Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O. -Default on all supported operating systems except for Windows. +Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O. Default on +all supported operating systems except for Windows. .TP .B vsync -Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate queuing by -coalescing adjacent IOs into a single submission. +Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate +queuing by coalescing adjacent I/Os into a single submission. 
.TP .B pvsync Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O. @@ -634,10 +1436,14 @@ Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O. .TP .B libaio -Linux native asynchronous I/O. This ioengine defines engine specific options. +Linux native asynchronous I/O. Note that Linux may only support +queued behavior with non\-buffered I/O (set `direct=1' or +`buffered=0'). +This engine defines engine specific options. .TP .B posixaio -POSIX asynchronous I/O using \fBaio_read\fR\|(3) and \fBaio_write\fR\|(3). +POSIX asynchronous I/O using \fBaio_read\fR\|(3) and +\fBaio_write\fR\|(3). .TP .B solarisaio Solaris native asynchronous I/O. @@ -646,459 +1452,554 @@ Windows native asynchronous I/O. Default on Windows. .TP .B mmap -File is memory mapped with \fBmmap\fR\|(2) and data copied using -\fBmemcpy\fR\|(3). +File is memory mapped with \fBmmap\fR\|(2) and data copied +to/from using \fBmemcpy\fR\|(3). .TP .B splice -\fBsplice\fR\|(2) is used to transfer the data and \fBvmsplice\fR\|(2) to -transfer data from user-space to the kernel. +\fBsplice\fR\|(2) is used to transfer the data and +\fBvmsplice\fR\|(2) to transfer data from user space to the +kernel. .TP .B sg -SCSI generic sg v3 I/O. May be either synchronous using the SG_IO ioctl, or if -the target is an sg character device, we use \fBread\fR\|(2) and -\fBwrite\fR\|(2) for asynchronous I/O. +SCSI generic sg v3 I/O. May either be synchronous using the SG_IO +ioctl, or if the target is an sg character device we use +\fBread\fR\|(2) and \fBwrite\fR\|(2) for asynchronous +I/O. Requires \fBfilename\fR option to specify either block or +character devices. .TP .B null -Doesn't transfer any data, just pretends to. Mainly used to exercise \fBfio\fR -itself and for debugging and testing purposes. +Doesn't transfer any data, just pretends to. This is mainly used to +exercise fio itself and for debugging/testing purposes. .TP .B net -Transfer over the network. 
The protocol to be used can be defined with the -\fBprotocol\fR parameter. Depending on the protocol, \fBfilename\fR, -\fBhostname\fR, \fBport\fR, or \fBlisten\fR must be specified. -This ioengine defines engine specific options. +Transfer over the network to given `host:port'. Depending on the +\fBprotocol\fR used, the \fBhostname\fR, \fBport\fR, +\fBlisten\fR and \fBfilename\fR options are used to specify +what sort of connection to make, while the \fBprotocol\fR option +determines which protocol will be used. This engine defines engine +specific options. .TP .B netsplice -Like \fBnet\fR, but uses \fBsplice\fR\|(2) and \fBvmsplice\fR\|(2) to map data -and send/receive. This ioengine defines engine specific options. +Like \fBnet\fR, but uses \fBsplice\fR\|(2) and +\fBvmsplice\fR\|(2) to map data and send/receive. +This engine defines engine specific options. .TP .B cpuio -Doesn't transfer any data, but burns CPU cycles according to \fBcpuload\fR and -\fBcpuchunks\fR parameters. A job never finishes unless there is at least one -non-cpuio job. +Doesn't transfer any data, but burns CPU cycles according to the +\fBcpuload\fR and \fBcpuchunks\fR options. Setting +\fBcpuload\fR=85 will cause that job to do nothing but burn 85% +of the CPU. In case of SMP machines, use `numjobs=<nr_of_cpu>' +to get desired CPU usage, as the cpuload only loads a +single CPU at the desired rate. A job never finishes unless there is +at least one non\-cpuio job. .TP .B guasi -The GUASI I/O engine is the Generic Userspace Asynchronous Syscall Interface -approach to asynchronous I/O. -.br -See . +The GUASI I/O engine is the Generic Userspace Asynchronous Syscall +Interface approach to async I/O. See \fIhttp://www.xmailserver.org/guasi\-lib.html\fR +for more info on GUASI. .TP .B rdma -The RDMA I/O engine supports both RDMA memory semantics (RDMA_WRITE/RDMA_READ) -and channel semantics (Send/Recv) for the InfiniBand, RoCE and iWARP protocols. -.TP -.B external -Loads an external I/O engine object file.
Append the engine filename as -`:\fIenginepath\fR'. +The RDMA I/O engine supports both RDMA memory semantics +(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the +InfiniBand, RoCE and iWARP protocols. .TP .B falloc - IO engine that does regular linux native fallocate call to simulate data -transfer as fio ioengine -.br - DDIR_READ does fallocate(,mode = FALLOC_FL_KEEP_SIZE,) -.br - DIR_WRITE does fallocate(,mode = 0) -.br - DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE) +I/O engine that does regular fallocate to simulate data transfer as +fio ioengine. +.RS +.P +.PD 0 +DDIR_READ does fallocate(,mode = FALLOC_FL_KEEP_SIZE,). +.P +DDIR_WRITE does fallocate(,mode = 0). +.P +DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE). +.PD +.RE +.TP +.B ftruncate +I/O engine that sends \fBftruncate\fR\|(2) operations in response +to write (DDIR_WRITE) events. Each ftruncate issued sets the file's +size to the current block offset. \fBblocksize\fR is ignored. .TP .B e4defrag -IO engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate defragment activity -request to DDIR_WRITE event +I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate +defragment activity in response to DDIR_WRITE events. .TP .B rbd -IO engine supporting direct access to Ceph Rados Block Devices (RBD) via librbd -without the need to use the kernel rbd driver. This ioengine defines engine specific -options. +I/O engine supporting direct access to Ceph Rados Block Devices +(RBD) via librbd without the need to use the kernel rbd driver. This +ioengine defines engine specific options. .TP .B gfapi -Using Glusterfs libgfapi sync interface to direct access to Glusterfs volumes without -having to go through FUSE. This ioengine defines engine specific -options. +Using GlusterFS libgfapi sync interface to direct access to +GlusterFS volumes without having to go through FUSE. This ioengine +defines engine specific options.
.TP .B gfapi_async -Using Glusterfs libgfapi async interface to direct access to Glusterfs volumes without -having to go through FUSE. This ioengine defines engine specific -options. +Using GlusterFS libgfapi async interface to direct access to +GlusterFS volumes without having to go through FUSE. This ioengine +defines engine specific options. .TP .B libhdfs -Read and write through Hadoop (HDFS). The \fBfilename\fR option is used to -specify host,port of the hdfs name-node to connect. This engine interprets -offsets a little differently. In HDFS, files once created cannot be modified. -So random writes are not possible. To imitate this, libhdfs engine expects -bunch of small files to be created over HDFS, and engine will randomly pick a -file out of those files based on the offset generated by fio backend. (see the -example job file to create such files, use rw=write option). Please note, you -might want to set necessary environment variables to work with hdfs/libhdfs -properly. +Read and write through Hadoop (HDFS). The \fBfilename\fR option +is used to specify host,port of the hdfs name\-node to connect. This +engine interprets offsets a little differently. In HDFS, files once +created cannot be modified so random writes are not possible. To +imitate this the libhdfs engine expects a bunch of small files to be +created over HDFS and will randomly pick a file from them +based on the offset generated by fio backend (see the example +job file to create such files, use `rw=write' option). Please +note, it may be necessary to set environment variables to work +with HDFS/libhdfs properly. Each job uses its own connection to +HDFS. .TP .B mtd -Read, write and erase an MTD character device (e.g., /dev/mtd0). Discards are -treated as erases. Depending on the underlying device type, the I/O may have -to go in a certain pattern, e.g., on NAND, writing sequentially to erase blocks -and discarding before overwriting. 
The writetrim mode works well for this +Read, write and erase an MTD character device (e.g., +`/dev/mtd0'). Discards are treated as erases. Depending on the +underlying device type, the I/O may have to go in a certain pattern, +e.g., on NAND, writing sequentially to erase blocks and discarding +before overwriting. The \fBtrimwrite\fR mode works well for this constraint. .TP .B pmemblk -Read and write through the NVML libpmemblk interface. +Read and write using filesystem DAX to a file on a filesystem +mounted with DAX on a persistent memory device through the NVML +libpmemblk library. +.TP +.B dev\-dax +Read and write using device DAX to a persistent memory device (e.g., +/dev/dax0.0) through the NVML libpmem library. .TP -.B dev-dax -Read and write through a DAX device exposed from persistent memory. -.RE -.P -.RE -.TP -.BI iodepth \fR=\fPint -Number of I/O units to keep in flight against the file. Note that increasing -iodepth beyond 1 will not affect synchronous ioengines (except for small -degress when verify_async is in use). Even async engines may impose OS -restrictions causing the desired depth not to be achieved. This may happen on -Linux when using libaio and not setting \fBdirect\fR=1, since buffered IO is -not async on that OS. Keep an eye on the IO depth distribution in the -fio output to verify that the achieved depth is as expected. Default: 1. -.TP -.BI iodepth_batch \fR=\fPint "\fR,\fP iodepth_batch_submit" \fR=\fPint -This defines how many pieces of IO to submit at once. It defaults to 1 -which means that we submit each IO as soon as it is available, but can -be raised to submit bigger batches of IO at the time. If it is set to 0 -the \fBiodepth\fR value will be used. -.TP -.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint -This defines how many pieces of IO to retrieve at once. It defaults to 1 which - means that we'll ask for a minimum of 1 IO in the retrieval process from the -kernel. 
The IO retrieval will go on until we hit the limit set by -\fBiodepth_low\fR. If this variable is set to 0, then fio will always check for -completed events before queuing more IO. This helps reduce IO latency, at the -cost of more retrieval system calls. +.B external +Prefix to specify loading an external I/O engine object file. Append +the engine filename, e.g. `ioengine=external:/tmp/foo.o' to load +ioengine `foo.o' in `/tmp'. The path can be either +absolute or relative. See `engines/skeleton_external.c' in the fio source for +details of writing an external I/O engine. +.SS "I/O engine specific parameters" +In addition, there are some parameters which are only valid when a specific +\fBioengine\fR is in use. These are used identically to normal parameters, +with the caveat that when used on the command line, they must come after the +\fBioengine\fR that defines them is selected. .TP -.BI iodepth_batch_complete_max \fR=\fPint -This defines maximum pieces of IO to -retrieve at once. This variable should be used along with -\fBiodepth_batch_complete_min\fR=int variable, specifying the range -of min and max amount of IO which should be retrieved. By default -it is equal to \fBiodepth_batch_complete_min\fR value. - -Example #1: -.RS -.RS -\fBiodepth_batch_complete_min\fR=1 -.LP -\fBiodepth_batch_complete_max\fR= -.RE - -which means that we will retrieve at least 1 IO and up to the -whole submitted queue depth. If none of IO has been completed -yet, we will wait. - -Example #2: -.RS -\fBiodepth_batch_complete_min\fR=0 -.LP -\fBiodepth_batch_complete_max\fR= -.RE - -which means that we can retrieve up to the whole submitted -queue depth, but if none of IO has been completed yet, we will -NOT wait and immediately exit the system call. In this example -we simply do polling. -.RE +.BI (libaio)userspace_reap +Normally, with the libaio engine in use, fio will use the +\fBio_getevents\fR\|(3) system call to reap newly returned events. 
With +this flag turned on, the AIO ring will be read directly from user\-space to +reap events. The reaping mode is only enabled when polling for a minimum of +0 events (e.g. when `iodepth_batch_complete=0'). .TP -.BI iodepth_low \fR=\fPint -Low watermark indicating when to start filling the queue again. Default: -\fBiodepth\fR. +.BI (pvsync2)hipri +Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority +than normal. .TP -.BI io_submit_mode \fR=\fPstr -This option controls how fio submits the IO to the IO engine. The default is -\fBinline\fR, which means that the fio job threads submit and reap IO directly. -If set to \fBoffload\fR, the job threads will offload IO submission to a -dedicated pool of IO threads. This requires some coordination and thus has a -bit of extra overhead, especially for lower queue depth IO where it can -increase latencies. The benefit is that fio can manage submission rates -independently of the device completion rates. This avoids skewed latency -reporting if IO gets back up on the device side (the coordinated omission -problem). +.BI (pvsync2)hipri_percentage +When hipri is set this determines the probability of a pvsync2 I/O being high +priority. The default is 100%. .TP -.BI direct \fR=\fPbool -If true, use non-buffered I/O (usually O_DIRECT). Default: false. +.BI (cpuio)cpuload \fR=\fPint +Attempt to use the specified percentage of CPU cycles. This is a mandatory +option when using cpuio I/O engine. .TP -.BI atomic \fR=\fPbool -If value is true, attempt to use atomic direct IO. Atomic writes are guaranteed -to be stable once acknowledged by the operating system. Only Linux supports -O_ATOMIC right now. +.BI (cpuio)cpuchunks \fR=\fPint +Split the load into cycles of the given time. In microseconds. .TP -.BI buffered \fR=\fPbool -If true, use buffered I/O. This is the opposite of the \fBdirect\fR parameter. -Default: true. +.BI (cpuio)exit_on_io_done \fR=\fPbool +Detect when I/O threads are done, then exit. 
.TP -.BI offset \fR=\fPint -Offset in the file to start I/O. Data before the offset will not be touched. +.BI (libhdfs)namenode \fR=\fPstr +The hostname or IP address of a HDFS cluster namenode to contact. .TP -.BI offset_increment \fR=\fPint -If this is provided, then the real offset becomes the -offset + offset_increment * thread_number, where the thread number is a -counter that starts at 0 and is incremented for each sub-job (i.e. when -numjobs option is specified). This option is useful if there are several jobs -which are intended to operate on a file in parallel disjoint segments, with -even spacing between the starting points. +.BI (libhdfs)port +The listening port of the HDFS cluster namenode. .TP -.BI number_ios \fR=\fPint -Fio will normally perform IOs until it has exhausted the size of the region -set by \fBsize\fR, or if it exhaust the allocated time (or hits an error -condition). With this setting, the range/size can be set independently of -the number of IOs to perform. When fio reaches this number, it will exit -normally and report status. Note that this does not extend the amount -of IO that will be done, it will only stop fio if this condition is met -before other end-of-job criteria. +.BI (netsplice,net)port +The TCP or UDP port to bind to or connect to. If this is used with +\fBnumjobs\fR to spawn multiple instances of the same job type, then +this will be the starting port number since fio will use a range of +ports. .TP -.BI fsync \fR=\fPint -How many I/Os to perform before issuing an \fBfsync\fR\|(2) of dirty data. If -0, don't sync. Default: 0. +.BI (netsplice,net)hostname \fR=\fPstr +The hostname or IP address to use for TCP or UDP based I/O. If the job is +a TCP listener or UDP reader, the hostname is not used and must be omitted +unless it is a valid UDP multicast address. +.TP +.BI (netsplice,net)interface \fR=\fPstr +The IP address of the network interface used to send or receive UDP +multicast.
.TP -.BI fdatasync \fR=\fPint -Like \fBfsync\fR, but uses \fBfdatasync\fR\|(2) instead to only sync the -data parts of the file. Default: 0. +.BI (netsplice,net)ttl \fR=\fPint +Time\-to\-live value for outgoing UDP multicast packets. Default: 1. .TP -.BI write_barrier \fR=\fPint -Make every Nth write a barrier write. +.BI (netsplice,net)nodelay \fR=\fPbool +Set TCP_NODELAY on TCP connections. .TP -.BI sync_file_range \fR=\fPstr:int -Use \fBsync_file_range\fR\|(2) for every \fRval\fP number of write operations. Fio will -track range of writes that have happened since the last \fBsync_file_range\fR\|(2) call. -\fRstr\fP can currently be one or more of: +.BI (netsplice,net)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr +The network protocol to use. Accepted values are: +.RS .RS .TP -.B wait_before -SYNC_FILE_RANGE_WAIT_BEFORE +.B tcp +Transmission control protocol. .TP -.B write -SYNC_FILE_RANGE_WRITE +.B tcpv6 +Transmission control protocol V6. .TP -.B wait_after -SYNC_FILE_RANGE_WRITE +.B udp +User datagram protocol. .TP +.B udpv6 +User datagram protocol V6. +.TP +.B unix +UNIX domain socket. .RE .P -So if you do sync_file_range=wait_before,write:8, fio would use -\fBSYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE\fP for every 8 writes. -Also see the \fBsync_file_range\fR\|(2) man page. This option is Linux specific. +When the protocol is TCP or UDP, the port must also be given, as well as the +hostname if the job is a TCP listener or UDP reader. For unix sockets, the +normal \fBfilename\fR option should be used and the port is invalid. +.RE +.TP +.BI (netsplice,net)listen +For TCP network connections, tell fio to listen for incoming connections +rather than initiating an outgoing connection. The \fBhostname\fR must +be omitted if this option is used. +.TP +.BI (netsplice,net)pingpong +Normally a network writer will just continue writing data, and a network +reader will just consume packets.
If `pingpong=1' is set, a writer will +send its normal payload to the reader, then wait for the reader to send the +same payload back. This allows fio to measure network latencies. The +submission and completion latencies then measure local time spent sending or +receiving, and the completion latency measures how long it took for the +other end to receive and send back. For UDP multicast traffic +`pingpong=1' should only be set for a single reader when multiple readers +are listening to the same address. .TP -.BI overwrite \fR=\fPbool -If writing, setup the file first and do overwrites. Default: false. +.BI (netsplice,net)window_size \fR=\fPint +Set the desired socket buffer size for the connection. .TP -.BI end_fsync \fR=\fPbool -Sync file contents when a write stage has completed. Default: false. +.BI (netsplice,net)mss \fR=\fPint +Set the TCP maximum segment size (TCP_MAXSEG). .TP -.BI fsync_on_close \fR=\fPbool -If true, sync file contents on close. This differs from \fBend_fsync\fR in that -it will happen on every close, not just at the end of the job. Default: false. +.BI (e4defrag)donorname \fR=\fPstr +File will be used as a block donor (swap extents between files). .TP -.BI rwmixread \fR=\fPint -Percentage of a mixed workload that should be reads. Default: 50. +.BI (e4defrag)inplace \fR=\fPint +Configure donor file blocks allocation strategy: +.RS +.RS .TP -.BI rwmixwrite \fR=\fPint -Percentage of a mixed workload that should be writes. If \fBrwmixread\fR and -\fBrwmixwrite\fR are given and do not sum to 100%, the latter of the two -overrides the first. This may interfere with a given rate setting, if fio is -asked to limit reads or writes to a certain rate. If that is the case, then -the distribution may be skewed. Default: 50. +.B 0 +Default. Preallocate donor's file on init. .TP -.BI random_distribution \fR=\fPstr:float -By default, fio will use a completely uniform random distribution when asked -to perform random IO. 
Sometimes it is useful to skew the distribution in -specific ways, ensuring that some parts of the data is more hot than others. -Fio includes the following distribution models: -.RS +.B 1 +Allocate space immediately inside defragment event, and free right +after event. +.RE +.RE .TP -.B random -Uniform random distribution +.BI (rbd)clustername \fR=\fPstr +Specifies the name of the Ceph cluster. .TP -.B zipf -Zipf distribution +.BI (rbd)rbdname \fR=\fPstr +Specifies the name of the RBD. .TP -.B pareto -Pareto distribution +.BI (rbd)pool \fR=\fPstr +Specifies the name of the Ceph pool containing RBD. .TP -.B gauss -Normal (gaussian) distribution +.BI (rbd)clientname \fR=\fPstr +Specifies the username (without the 'client.' prefix) used to access the +Ceph cluster. If the \fBclustername\fR is specified, the \fBclientname\fR shall be +the full *type.id* string. If no type. prefix is given, fio will add 'client.' +by default. .TP -.B zoned -Zoned random distribution +.BI (mtd)skip_bad \fR=\fPbool +Skip operations against known bad blocks. .TP -.RE -When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also -needed to define the access pattern. For \fBzipf\fR, this is the zipf theta. -For \fBpareto\fR, it's the pareto power. Fio includes a test program, genzipf, -that can be used visualize what the given input values will yield in terms of -hit rates. If you wanted to use \fBzipf\fR with a theta of 1.2, you would use -random_distribution=zipf:1.2 as the option. If a non-uniform model is used, -fio will disable use of the random map. For the \fBgauss\fR distribution, a -normal deviation is supplied as a value between 0 and 100. -.P -.RS -For a \fBzoned\fR distribution, fio supports specifying percentages of IO -access that should fall within what range of the file or device. For example, -given a criteria of: -.P -.RS -60% of accesses should be to the first 10% -.RE +.BI (libhdfs)hdfsdirectory +libhdfs will create chunk in this HDFS directory. 
+.TP +.BI (libhdfs)chunk_size +The size of the chunk to use for each file. +.SS "I/O depth" +.TP +.BI iodepth \fR=\fPint +Number of I/O units to keep in flight against the file. Note that +increasing \fBiodepth\fR beyond 1 will not affect synchronous ioengines (except +for small degrees when \fBverify_async\fR is in use). Even async +engines may impose OS restrictions causing the desired depth not to be +achieved. This may happen on Linux when using libaio and not setting +`direct=1', since buffered I/O is not async on that OS. Keep an +eye on the I/O depth distribution in the fio output to verify that the +achieved depth is as expected. Default: 1. +.TP +.BI iodepth_batch_submit \fR=\fPint "\fR,\fP iodepth_batch" \fR=\fPint +This defines how many pieces of I/O to submit at once. It defaults to 1 +which means that we submit each I/O as soon as it is available, but can be +raised to submit bigger batches of I/O at the time. If it is set to 0 the +\fBiodepth\fR value will be used. +.TP +.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint +This defines how many pieces of I/O to retrieve at once. It defaults to 1 +which means that we'll ask for a minimum of 1 I/O in the retrieval process +from the kernel. The I/O retrieval will go on until we hit the limit set by +\fBiodepth_low\fR. If this variable is set to 0, then fio will always +check for completed events before queuing more I/O. This helps reduce I/O +latency, at the cost of more retrieval system calls. +.TP +.BI iodepth_batch_complete_max \fR=\fPint +This defines maximum pieces of I/O to retrieve at once. This variable should +be used along with \fBiodepth_batch_complete_min\fR=\fIint\fR variable, +specifying the range of min and max amount of I/O which should be +retrieved. By default it is equal to \fBiodepth_batch_complete_min\fR +value. 
Example #1: .RS -30% of accesses should be to the next 20% -.RE .RS -8% of accesses should be to to the next 30% +.P +.PD 0 +iodepth_batch_complete_min=1 +.P +iodepth_batch_complete_max= +.PD .RE +.P +which means that we will retrieve at least 1 I/O and up to the whole +submitted queue depth. If none of I/O has been completed yet, we will wait. +Example #2: .RS -2% of accesses should be to the next 40% -.RE .P -we can define that through zoning of the random accesses. For the above -example, the user would do: +.PD 0 +iodepth_batch_complete_min=0 .P -.RS -.B random_distribution=zoned:60/10:30/20:8/30:2/40 +iodepth_batch_complete_max= +.PD .RE .P -similarly to how \fBbssplit\fR works for setting ranges and percentages of block -sizes. Like \fBbssplit\fR, it's possible to specify separate zones for reads, -writes, and trims. If just one set is given, it'll apply to all of them. +which means that we can retrieve up to the whole submitted queue depth, but +if none of I/O has been completed yet, we will NOT wait and immediately exit +the system call. In this example we simply do polling. .RE .TP -.BI percentage_random \fR=\fPint -For a random workload, set how big a percentage should be random. This defaults -to 100%, in which case the workload is fully random. It can be set from -anywhere from 0 to 100. Setting it to 0 would make the workload fully -sequential. It is possible to set different values for reads, writes, and -trim. To do so, simply use a comma separated list. See \fBblocksize\fR. +.BI iodepth_low \fR=\fPint +The low water mark indicating when to start filling the queue +again. Defaults to the same as \fBiodepth\fR, meaning that fio will +attempt to keep the queue full at all times. If \fBiodepth\fR is set to +e.g. 16 and \fBiodepth_low\fR is set to 4, then after fio has filled the queue of +16 requests, it will let the depth drain down to 4 before starting to fill +it again. 
+.TP +.BI serialize_overlap \fR=\fPbool +Serialize in-flight I/Os that might otherwise cause or suffer from data races. +When two or more I/Os are submitted simultaneously, there is no guarantee that +the I/Os will be processed or completed in the submitted order. Further, if +two or more of those I/Os are writes, any overlapping region between them can +become indeterminate/undefined on certain storage. These issues can cause +verification to fail erratically when at least one of the racing I/Os is +changing data and the overlapping region has a non-zero size. Setting +\fBserialize_overlap\fR tells fio to avoid provoking this behavior by explicitly +serializing in-flight I/Os that have a non-zero overlap. Note that setting +this option can reduce both performance and the \fBiodepth\fR achieved. +Additionally this option does not work when \fBio_submit_mode\fR is set to +offload. Default: false. .TP -.B norandommap -Normally \fBfio\fR will cover every block of the file when doing random I/O. If -this parameter is given, a new offset will be chosen without looking at past -I/O history. This parameter is mutually exclusive with \fBverify\fR. +.BI io_submit_mode \fR=\fPstr +This option controls how fio submits the I/O to the I/O engine. The default +is `inline', which means that the fio job threads submit and reap I/O +directly. If set to `offload', the job threads will offload I/O submission +to a dedicated pool of I/O threads. This requires some coordination and thus +has a bit of extra overhead, especially for lower queue depth I/O where it +can increase latencies. The benefit is that fio can manage submission rates +independently of the device completion rates. This avoids skewed latency +reporting if I/O gets backed up on the device side (the coordinated omission +problem). +.SS "I/O rate" .TP -.BI softrandommap \fR=\fPbool -See \fBnorandommap\fR. 
If fio runs with the random block map enabled and it -fails to allocate the map, if this option is set it will continue without a -random block map. As coverage will not be as complete as with random maps, this -option is disabled by default. +.BI thinktime \fR=\fPtime +Stall the job for the specified period of time after an I/O has completed before issuing the +next. May be used to simulate processing being done by an application. +When the unit is omitted, the value is interpreted in microseconds. See +\fBthinktime_blocks\fR and \fBthinktime_spin\fR. +.TP +.BI thinktime_spin \fR=\fPtime +Only valid if \fBthinktime\fR is set \- pretend to spend CPU time doing +something with the data received, before falling back to sleeping for the +rest of the period specified by \fBthinktime\fR. When the unit is +omitted, the value is interpreted in microseconds. .TP -.BI random_generator \fR=\fPstr -Fio supports the following engines for generating IO offsets for random IO: -.RS +.BI thinktime_blocks \fR=\fPint +Only valid if \fBthinktime\fR is set \- control how many blocks to issue, +before waiting \fBthinktime\fR usecs. If not set, defaults to 1 which will make +fio wait \fBthinktime\fR usecs after every block. This effectively makes any +queue depth setting redundant, since no more than 1 I/O will be queued +before we have to complete it and do our \fBthinktime\fR. In other words, this +setting effectively caps the queue depth if the latter is larger. +.TP +.BI rate \fR=\fPint[,int][,int] +Cap the bandwidth used by this job. The number is in bytes/sec, the normal +suffix rules apply. Comma\-separated values may be specified for reads, +writes, and trims as described in \fBblocksize\fR. +.RS +.P +For example, using `rate=1m,500k' would limit reads to 1MiB/sec and writes to +500KiB/sec. Capping only reads or writes can be done with `rate=,500k' or +`rate=500k,' where the former will only limit writes (to 500KiB/sec) and the +latter will only limit reads. 
+.RE +.TP +.BI rate_min \fR=\fPint[,int][,int] +Tell fio to do whatever it can to maintain at least this bandwidth. Failing +to meet this requirement will cause the job to exit. Comma\-separated values +may be specified for reads, writes, and trims as described in +\fBblocksize\fR. +.TP +.BI rate_iops \fR=\fPint[,int][,int] +Cap the bandwidth to this number of IOPS. Basically the same as +\fBrate\fR, just specified independently of bandwidth. If the job is +given a block size range instead of a fixed value, the smallest block size +is used as the metric. Comma\-separated values may be specified for reads, +writes, and trims as described in \fBblocksize\fR. +.TP +.BI rate_iops_min \fR=\fPint[,int][,int] +If fio doesn't meet this rate of I/O, it will cause the job to exit. +Comma\-separated values may be specified for reads, writes, and trims as +described in \fBblocksize\fR. .TP -.B tausworthe -Strong 2^88 cycle random number generator +.BI rate_process \fR=\fPstr +This option controls how fio manages rated I/O submissions. The default is +`linear', which submits I/O in a linear fashion with fixed delays between +I/Os that gets adjusted based on I/O completion rates. If this is set to +`poisson', fio will submit I/O based on a more real world random request +flow, known as the Poisson process +(\fIhttps://en.wikipedia.org/wiki/Poisson_point_process\fR). The lambda will be +10^6 / IOPS for the given workload. +.SS "I/O latency" .TP -.B lfsr -Linear feedback shift register generator +.BI latency_target \fR=\fPtime +If set, fio will attempt to find the max performance point that the given +workload will run at while maintaining a latency below this target. When +the unit is omitted, the value is interpreted in microseconds. See +\fBlatency_window\fR and \fBlatency_percentile\fR. 
.TP -.B tausworthe64 -Strong 64-bit 2^258 cycle random number generator +.BI latency_window \fR=\fPtime +Used with \fBlatency_target\fR to specify the sample window that the job +is run at varying queue depths to test the performance. When the unit is +omitted, the value is interpreted in microseconds. .TP -.RE -.P -Tausworthe is a strong random number generator, but it requires tracking on the -side if we want to ensure that blocks are only read or written once. LFSR -guarantees that we never generate the same offset twice, and it's also less -computationally expensive. It's not a true random generator, however, though -for IO purposes it's typically good enough. LFSR only works with single block -sizes, not with workloads that use multiple block sizes. If used with such a -workload, fio may read or write some blocks multiple times. The default -value is tausworthe, unless the required space exceeds 2^32 blocks. If it does, -then tausworthe64 is selected automatically. +.BI latency_percentile \fR=\fPfloat +The percentage of I/Os that must fall within the criteria specified by +\fBlatency_target\fR and \fBlatency_window\fR. If not set, this +defaults to 100.0, meaning that all I/Os must be equal to or below the value +set by \fBlatency_target\fR. +.TP +.BI max_latency \fR=\fPtime +If set, fio will exit the job with an ETIMEDOUT error if it exceeds this +maximum latency. When the unit is omitted, the value is interpreted in +microseconds. .TP -.BI nice \fR=\fPint -Run job with given nice value. See \fBnice\fR\|(2). +.BI rate_cycle \fR=\fPint +Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number +of milliseconds. Defaults to 1000. +.SS "I/O replay" .TP -.BI prio \fR=\fPint -Set I/O priority value of this job between 0 (highest) and 7 (lowest). See -\fBionice\fR\|(1). +.BI write_iolog \fR=\fPstr +Write the issued I/O patterns to the specified file. See +\fBread_iolog\fR.
Specify a separate file for each job, otherwise the +iologs will be interspersed and the file may be corrupt. .TP -.BI prioclass \fR=\fPint -Set I/O priority class. See \fBionice\fR\|(1). +.BI read_iolog \fR=\fPstr +Open an iolog with the specified filename and replay the I/O patterns it +contains. This can be used to store a workload and replay it sometime +later. The iolog given may also be a blktrace binary file, which allows fio +to replay a workload captured by blktrace. See +\fBblktrace\fR\|(8) for how to capture such logging data. For blktrace +replay, the file needs to be turned into a blkparse binary data file first +(`blkparse \-o /dev/null \-d file_for_fio.bin'). +.TP +.BI replay_no_stall \fR=\fPbool +When replaying I/O with \fBread_iolog\fR the default behavior is to +attempt to respect the timestamps within the log and replay them with the +appropriate delay between IOPS. By setting this variable fio will not +respect the timestamps and attempt to replay them as fast as possible while +still respecting ordering. The result is the same I/O pattern to a given +device, but different timings. .TP -.BI thinktime \fR=\fPint -Stall job for given number of microseconds between issuing I/Os. +.BI replay_redirect \fR=\fPstr +While replaying I/O patterns using \fBread_iolog\fR the default behavior +is to replay the IOPS onto the major/minor device that each IOP was recorded +from. This is sometimes undesirable because on a different machine those +major/minor numbers can map to a different device. Changing hardware on the +same system can also result in a different major/minor mapping. +\fBreplay_redirect\fR causes all I/Os to be replayed onto the single specified +device regardless of the device it was recorded +from. i.e. `replay_redirect=/dev/sdc' would cause all I/O +in the blktrace or iolog to be replayed onto `/dev/sdc'. This means +multiple devices will be replayed onto a single device, if the trace +contains multiple devices. 
If you want multiple devices to be replayed +concurrently to multiple redirected devices you must blkparse your trace +into separate traces and replay them with independent fio invocations. +Unfortunately this also breaks the strict time ordering between multiple +device accesses. .TP -.BI thinktime_spin \fR=\fPint -Pretend to spend CPU time for given number of microseconds, sleeping the rest -of the time specified by \fBthinktime\fR. Only valid if \fBthinktime\fR is set. +.BI replay_align \fR=\fPint +Force alignment of I/O offsets and lengths in a trace to this power of 2 +value. .TP -.BI thinktime_blocks \fR=\fPint -Only valid if thinktime is set - control how many blocks to issue, before -waiting \fBthinktime\fR microseconds. If not set, defaults to 1 which will -make fio wait \fBthinktime\fR microseconds after every block. This -effectively makes any queue depth setting redundant, since no more than 1 IO -will be queued before we have to complete it and do our thinktime. In other -words, this setting effectively caps the queue depth if the latter is larger. -Default: 1. -.TP -.BI rate \fR=\fPint -Cap bandwidth used by this job. The number is in bytes/sec, the normal postfix -rules apply. You can use \fBrate\fR=500k to limit reads and writes to 500k each, -or you can specify read and writes separately. Using \fBrate\fR=1m,500k would -limit reads to 1MB/sec and writes to 500KB/sec. Capping only reads or writes -can be done with \fBrate\fR=,500k or \fBrate\fR=500k,. The former will only -limit writes (to 500KB/sec), the latter will only limit reads. -.TP -.BI rate_min \fR=\fPint -Tell \fBfio\fR to do whatever it can to maintain at least the given bandwidth. -Failing to meet this requirement will cause the job to exit. The same format -as \fBrate\fR is used for read vs write separation. -.TP -.BI rate_iops \fR=\fPint -Cap the bandwidth to this number of IOPS. Basically the same as rate, just -specified independently of bandwidth. 
The same format as \fBrate\fR is used for -read vs write separation. If \fBblocksize\fR is a range, the smallest block -size is used as the metric. -.TP -.BI rate_iops_min \fR=\fPint -If this rate of I/O is not met, the job will exit. The same format as \fBrate\fR -is used for read vs write separation. +.BI replay_scale \fR=\fPint +Scale sector offsets down by this factor when replaying traces. +.SS "Threads, processes and job synchronization" .TP -.BI rate_process \fR=\fPstr -This option controls how fio manages rated IO submissions. The default is -\fBlinear\fR, which submits IO in a linear fashion with fixed delays between -IOs that gets adjusted based on IO completion rates. If this is set to -\fBpoisson\fR, fio will submit IO based on a more real world random request -flow, known as the Poisson process -(https://en.wikipedia.org/wiki/Poisson_process). The lambda will be -10^6 / IOPS for the given workload. +.BI thread +Fio defaults to creating jobs by using fork, however if this option is +given, fio will create jobs by using POSIX Threads' function +\fBpthread_create\fR\|(3) to create threads instead. .TP -.BI rate_cycle \fR=\fPint -Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number of -milliseconds. Default: 1000ms. +.BI wait_for \fR=\fPstr +If set, the current job won't be started until all workers of the specified +waitee job are done. +.\" ignore blank line here from HOWTO as it looks normal without it +\fBwait_for\fR operates on the job name basis, so there are a few +limitations. First, the waitee must be defined prior to the waiter job +(meaning no forward references). Second, if a job is being referenced as a +waitee, it must have a unique name (no duplicate waitees). .TP -.BI latency_target \fR=\fPint -If set, fio will attempt to find the max performance point that the given -workload will run at while maintaining a latency below this target. The -values is given in microseconds. 
See \fBlatency_window\fR and -\fBlatency_percentile\fR. +.BI nice \fR=\fPint +Run the job with the given nice value. See man \fBnice\fR\|(2). +.\" ignore blank line here from HOWTO as it looks normal without it +On Windows, values less than \-15 set the process class to "High"; \-1 through +\-15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle" +priority class. .TP -.BI latency_window \fR=\fPint -Used with \fBlatency_target\fR to specify the sample window that the job -is run at varying queue depths to test the performance. The value is given -in microseconds. +.BI prio \fR=\fPint +Set the I/O priority value of this job. Linux limits us to a positive value +between 0 and 7, with 0 being the highest. See man +\fBionice\fR\|(1). Refer to an appropriate manpage for other operating +systems since meaning of priority may differ. .TP -.BI latency_percentile \fR=\fPfloat -The percentage of IOs that must fall within the criteria specified by -\fBlatency_target\fR and \fBlatency_window\fR. If not set, this defaults -to 100.0, meaning that all IOs must be equal or below to the value set -by \fBlatency_target\fR. -.TP -.BI max_latency \fR=\fPint -If set, fio will exit the job if it exceeds this maximum latency. It will exit -with an ETIME error. +.BI prioclass \fR=\fPint +Set the I/O priority class. See man \fBionice\fR\|(1). .TP .BI cpumask \fR=\fPint -Set CPU affinity for this job. \fIint\fR is a bitmask of allowed CPUs the job -may run on. See \fBsched_setaffinity\fR\|(2). +Set the CPU affinity of this job. The parameter given is a bit mask of +allowed CPUs the job may run on. So if you want the allowed CPUs to be 1 +and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man +\fBsched_setaffinity\fR\|(2). This may not work on all supported +operating systems or kernel versions. This option doesn't work well for a +higher CPU count than what you can store in an integer mask, so it can only +control cpus 1\-32. 
For boxes with larger CPU counts, use +\fBcpus_allowed\fR. .TP .BI cpus_allowed \fR=\fPstr -Same as \fBcpumask\fR, but allows a comma-delimited list of CPU numbers. +Controls the same options as \fBcpumask\fR, but accepts a textual +specification of the permitted CPUs instead. So to use CPUs 1 and 5 you +would specify `cpus_allowed=1,5'. This option also allows a range of CPUs +to be specified \-\- say you wanted a binding to CPUs 1, 5, and 8 to 15, you +would set `cpus_allowed=1,5,8\-15'. .TP .BI cpus_allowed_policy \fR=\fPstr -Set the policy of how fio distributes the CPUs specified by \fBcpus_allowed\fR -or \fBcpumask\fR. Two policies are supported: +Set the policy of how fio distributes the CPUs specified by +\fBcpus_allowed\fR or \fBcpumask\fR. Two policies are supported: .RS .RS .TP @@ -1109,827 +2010,711 @@ Each job will get a unique CPU from the CPU set. .RE .P -\fBshared\fR is the default behaviour, if the option isn't specified. If -\fBsplit\fR is specified, then fio will assign one cpu per job. If not enough -CPUs are given for the jobs listed, then fio will roundrobin the CPUs in -the set. +\fBshared\fR is the default behavior, if the option isn't specified. If +\fBsplit\fR is specified, then fio will assign one cpu per job. If not +enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs +in the set. .RE -.P .TP .BI numa_cpu_nodes \fR=\fPstr Set this job running on specified NUMA nodes' CPUs. The arguments allow -comma delimited list of cpu numbers, A-B ranges, or 'all'. +comma delimited list of cpu numbers, A\-B ranges, or `all'. Note, to enable +NUMA options support, fio must be built on a system with libnuma\-dev(el) +installed. .TP .BI numa_mem_policy \fR=\fPstr -Set this job's memory policy and corresponding NUMA nodes. Format of -the arguments: +Set this job's memory policy and corresponding NUMA nodes.
Format of the +arguments: .RS -.TP -.B <mode>[:<nodelist>] -.TP -.B mode -is one of the following memory policy: -.TP -.B default, prefer, bind, interleave, local -.TP +.RS +.P +<mode>[:<nodelist>] +.RE +.P +`mode' is one of the following memory policies: `default', `prefer', +`bind', `interleave' or `local'. For `default' and `local' memory +policies, no node needs to be specified. For `prefer', only one node is +allowed. For `bind' and `interleave' the `nodelist' may be as +follows: a comma delimited list of numbers, A\-B ranges, or `all'. .RE -For \fBdefault\fR and \fBlocal\fR memory policy, no \fBnodelist\fR is -needed to be specified. For \fBprefer\fR, only one node is -allowed. For \fBbind\fR and \fBinterleave\fR, \fBnodelist\fR allows -comma delimited list of numbers, A-B ranges, or 'all'. -.TP -.BI startdelay \fR=\fPirange -Delay start of job for the specified number of seconds. Supports all time -suffixes to allow specification of hours, minutes, seconds and -milliseconds - seconds are the default if a unit is omitted. -Can be given as a range which causes each thread to choose randomly out of the -range. -.TP -.BI runtime \fR=\fPint -Terminate processing after the specified number of seconds. -.TP -.B time_based -If given, run for the specified \fBruntime\fR duration even if the files are -completely read or written. The same workload will be repeated as many times -as \fBruntime\fR allows. -.TP -.BI ramp_time \fR=\fPint -If set, fio will run the specified workload for this amount of time before -logging any performance numbers. Useful for letting performance settle before -logging results, thus minimizing the runtime required for stable results. Note -that the \fBramp_time\fR is considered lead in time for a job, thus it will -increase the total runtime if a special timeout or runtime is specified. .TP -.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float -Define the criterion and limit for assessing steady state performance.
The -first parameter designates the criterion whereas the second parameter sets the -threshold. When the criterion falls below the threshold for the specified -duration, the job will stop. For example, iops_slope:0.1% will direct fio -to terminate the job when the least squares regression slope falls below 0.1% -of the mean IOPS. If group_reporting is enabled this will apply to all jobs in -the group. All assessments are carried out using only data from the rolling -collection window. Threshold limits can be expressed as a fixed value or as a -percentage of the mean in the collection window. Below are the available steady -state assessment criteria. +.BI cgroup \fR=\fPstr +Add job to this control group. If it doesn't exist, it will be created. The +system must have a mounted cgroup blkio mount point for this to work. If +your system doesn't have it mounted, you can do so with: .RS .RS -.TP -.B iops -Collect IOPS data. Stop the job if all individual IOPS measurements are within -the specified limit of the mean IOPS (e.g., iops:2 means that all individual -IOPS values must be within 2 of the mean, whereas iops:0.2% means that all -individual IOPS values must be within 0.2% of the mean IOPS to terminate the -job). -.TP -.B iops_slope -Collect IOPS data and calculate the least squares regression slope. Stop the -job if the slope falls below the specified limit. -.TP -.B bw -Collect bandwidth data. Stop the job if all individual bandwidth measurements -are within the specified limit of the mean bandwidth. -.TP -.B bw_slope -Collect bandwidth data and calculate the least squares regression slope. Stop -the job if the slope falls below the specified limit. +.P +# mount \-t cgroup \-o blkio none /cgroup .RE .RE .TP -.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime -A rolling window of this duration will be used to judge whether steady state -has been reached. Data will be collected once per second. The default is 0 -which disables steady state detection. 
-.TP -.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime -Allow the job to run for the specified duration before beginning data collection -for checking the steady state job termination criterion. The default is 0. -.TP -.BI invalidate \fR=\fPbool -Invalidate buffer-cache for the file prior to starting I/O. Default: true. +.BI cgroup_weight \fR=\fPint +Set the weight of the cgroup to this value. See the documentation that comes +with the kernel, allowed values are in the range of 100..1000. .TP -.BI sync \fR=\fPbool -Use synchronous I/O for buffered writes. For the majority of I/O engines, -this means using O_SYNC. Default: false. +.BI cgroup_nodelete \fR=\fPbool +Normally fio will delete the cgroups it has created after the job +completion. To override this behavior and to leave cgroups around after the +job completion, set `cgroup_nodelete=1'. This can be useful if one wants +to inspect various cgroup files after job completion. Default: false. .TP -.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr -Allocation method for I/O unit buffer. Allowed values are: -.RS -.RS +.BI flow_id \fR=\fPint +The ID of the flow. If not specified, it defaults to being a global +flow. See \fBflow\fR. .TP -.B malloc -Allocate memory with \fBmalloc\fR\|(3). Default memory type. +.BI flow \fR=\fPint +Weight in token\-based flow control. If this value is used, then there is +a 'flow counter' which is used to regulate the proportion of activity between +two or more jobs. Fio attempts to keep this flow counter near zero. The +\fBflow\fR parameter stands for how much should be added or subtracted to the +flow counter on each iteration of the main I/O loop. That is, if one job has +`flow=8' and another job has `flow=\-1', then there will be a roughly 1:8 +ratio in how much one runs vs the other. .TP -.B shm -Use shared memory buffers allocated through \fBshmget\fR\|(2). 
+.BI flow_watermark \fR=\fPint +The maximum value that the absolute value of the flow counter is allowed to +reach before the job must wait for a lower value of the counter. .TP -.B shmhuge -Same as \fBshm\fR, but use huge pages as backing. +.BI flow_sleep \fR=\fPint +The period of time, in microseconds, to wait after the flow watermark has +been exceeded before retrying operations. .TP -.B mmap -Use \fBmmap\fR\|(2) for allocation. Uses anonymous memory unless a filename -is given after the option in the format `:\fIfile\fR'. +.BI stonewall "\fR,\fB wait_for_previous" +Wait for preceding jobs in the job file to exit, before starting this +one. Can be used to insert serialization points in the job file. A stone +wall also implies starting a new reporting group, see +\fBgroup_reporting\fR. +.TP +.BI exitall +By default, fio will continue running all other jobs when one job finishes +but sometimes this is not the desired action. Setting \fBexitall\fR will +instead make fio terminate all other jobs when one job finishes. .TP -.B mmaphuge -Same as \fBmmap\fR, but use huge files as backing. +.BI exec_prerun \fR=\fPstr +Before running this job, issue the command specified through +\fBsystem\fR\|(3). Output is redirected in a file called `jobname.prerun.txt'. .TP -.B mmapshared -Same as \fBmmap\fR, but use a MMAP_SHARED mapping. -.RE -.P -The amount of memory allocated is the maximum allowed \fBblocksize\fR for the -job multiplied by \fBiodepth\fR. For \fBshmhuge\fR or \fBmmaphuge\fR to work, -the system must have free huge pages allocated. \fBmmaphuge\fR also needs to -have hugetlbfs mounted, and \fIfile\fR must point there. At least on Linux, -huge pages must be manually allocated. See \fB/proc/sys/vm/nr_hugehages\fR -and the documentation for that. Normally you just need to echo an appropriate -number, eg echoing 8 will ensure that the OS has 8 huge pages ready for -use. 
-.RE +.BI exec_postrun \fR=\fPstr +After the job completes, issue the command specified through +\fBsystem\fR\|(3). Output is redirected in a file called `jobname.postrun.txt'. .TP -.BI iomem_align \fR=\fPint "\fR,\fP mem_align" \fR=\fPint -This indicates the memory alignment of the IO memory buffers. Note that the -given alignment is applied to the first IO unit buffer, if using \fBiodepth\fR -the alignment of the following buffers are given by the \fBbs\fR used. In -other words, if using a \fBbs\fR that is a multiple of the page sized in the -system, all buffers will be aligned to this value. If using a \fBbs\fR that -is not page aligned, the alignment of subsequent IO memory buffers is the -sum of the \fBiomem_align\fR and \fBbs\fR used. +.BI uid \fR=\fPint +Instead of running as the invoking user, set the user ID to this value +before the thread/process does any work. .TP -.BI hugepage\-size \fR=\fPint -Defines the size of a huge page. Must be at least equal to the system setting. -Should be a multiple of 1MB. Default: 4MB. +.BI gid \fR=\fPint +Set group ID, see \fBuid\fR. +.SS "Verification" .TP -.B exitall -Terminate all jobs when one finishes. Default: wait for each job to finish. +.BI verify_only +Do not perform specified workload, only verify data still matches previous +invocation of this workload. This option allows one to check data multiple +times at a later date without overwriting it. This option makes sense only +for workloads that write data, and does not support workloads with the +\fBtime_based\fR option set. .TP -.B exitall_on_error \fR=\fPbool -Terminate all jobs if one job finishes in error. Default: wait for each job -to finish. +.BI do_verify \fR=\fPbool +Run the verify phase after a write phase. Only valid if \fBverify\fR is +set. Default: true. .TP -.BI bwavgtime \fR=\fPint -Average bandwidth calculations over the given time in milliseconds.
If the job -also does bandwidth logging through \fBwrite_bw_log\fR, then the minimum of -this option and \fBlog_avg_msec\fR will be used. Default: 500ms. +.BI verify \fR=\fPstr +If writing to a file, fio can verify the file contents after each iteration +of the job. Each verification method also implies verification of special +header, which is written to the beginning of each block. This header also +includes meta information, like offset of the block, block number, timestamp +when block was written, etc. \fBverify\fR can be combined with +\fBverify_pattern\fR option. The allowed values are: +.RS +.RS .TP -.BI iopsavgtime \fR=\fPint -Average IOPS calculations over the given time in milliseconds. If the job -also does IOPS logging through \fBwrite_iops_log\fR, then the minimum of -this option and \fBlog_avg_msec\fR will be used. Default: 500ms. +.B md5 +Use an md5 sum of the data area and store it in the header of +each block. .TP -.BI create_serialize \fR=\fPbool -If true, serialize file creation for the jobs. Default: true. +.B crc64 +Use an experimental crc64 sum of the data area and store it in the +header of each block. .TP -.BI create_fsync \fR=\fPbool -\fBfsync\fR\|(2) data file after creation. Default: true. +.B crc32c +Use a crc32c sum of the data area and store it in the header of +each block. This will automatically use hardware acceleration +(e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will +fall back to software crc32c if none is found. Generally the +fastest checksum fio supports when hardware accelerated. .TP -.BI create_on_open \fR=\fPbool -If true, the files are not created until they are opened for IO by the job. +.B crc32c\-intel +Synonym for crc32c. .TP -.BI create_only \fR=\fPbool -If true, fio will only run the setup phase of the job. If files need to be -laid out or updated on disk, only that will be done. The actual job contents -are not executed.
+.B crc32 +Use a crc32 sum of the data area and store it in the header of each +block. .TP -.BI allow_file_create \fR=\fPbool -If true, fio is permitted to create files as part of its workload. This is -the default behavior. If this option is false, then fio will error out if the -files it needs to use don't already exist. Default: true. +.B crc16 +Use a crc16 sum of the data area and store it in the header of each +block. .TP -.BI allow_mounted_write \fR=\fPbool -If this isn't set, fio will abort jobs that are destructive (eg that write) -to what appears to be a mounted device or partition. This should help catch -creating inadvertently destructive tests, not realizing that the test will -destroy data on the mounted file system. Default: false. +.B crc7 +Use a crc7 sum of the data area and store it in the header of each +block. .TP -.BI pre_read \fR=\fPbool -If this is given, files will be pre-read into memory before starting the given -IO operation. This will also clear the \fR \fBinvalidate\fR flag, since it is -pointless to pre-read and then drop the cache. This will only work for IO -engines that are seekable, since they allow you to read the same data -multiple times. Thus it will not work on eg network or splice IO. +.B xxhash +Use xxhash as the checksum function. Generally the fastest software +checksum that fio supports. .TP -.BI unlink \fR=\fPbool -Unlink job files when done. Default: false. +.B sha512 +Use sha512 as the checksum function. .TP -.BI unlink_each_loop \fR=\fPbool -Unlink job files after each iteration or loop. Default: false. +.B sha256 +Use sha256 as the checksum function. .TP -.BI loops \fR=\fPint -Specifies the number of iterations (runs of the same workload) of this job. -Default: 1. +.B sha1 +Use optimized sha1 as the checksum function. .TP -.BI verify_only \fR=\fPbool -Do not perform the specified workload, only verify data still matches previous -invocation of this workload. 
This option allows one to check data multiple -times at a later date without overwriting it. This option makes sense only for -workloads that write data, and does not support workloads with the -\fBtime_based\fR option set. +.B sha3\-224 +Use optimized sha3\-224 as the checksum function. .TP -.BI do_verify \fR=\fPbool -Run the verify phase after a write phase. Only valid if \fBverify\fR is set. -Default: true. +.B sha3\-256 +Use optimized sha3\-256 as the checksum function. .TP -.BI verify \fR=\fPstr -Method of verifying file contents after each iteration of the job. Each -verification method also implies verification of special header, which is -written to the beginning of each block. This header also includes meta -information, like offset of the block, block number, timestamp when block -was written, etc. \fBverify\fR=str can be combined with \fBverify_pattern\fR=str -option. The allowed values are: -.RS -.RS +.B sha3\-384 +Use optimized sha3\-384 as the checksum function. .TP -.B md5 crc16 crc32 crc32c crc32c-intel crc64 crc7 sha256 sha512 sha1 xxhash -Store appropriate checksum in the header of each block. crc32c-intel is -hardware accelerated SSE4.2 driven, falls back to regular crc32c if -not supported by the system. +.B sha3\-512 +Use optimized sha3\-512 as the checksum function. .TP .B meta -This option is deprecated, since now meta information is included in generic -verification header and meta verification happens by default. For detailed -information see the description of the \fBverify\fR=str setting. This option -is kept because of compatibility's sake with old configurations. Do not use it. +This option is deprecated, since now meta information is included in +generic verification header and meta verification happens by +default. For detailed information see the description of the +\fBverify\fR setting. This option is kept because of +compatibility's sake with old configurations. Do not use it. .TP .B pattern -Verify a strict pattern. 
Normally fio includes a header with some basic -information and checksumming, but if this option is set, only the -specific pattern set with \fBverify_pattern\fR is verified. +Verify a strict pattern. Normally fio includes a header with some +basic information and checksumming, but if this option is set, only +the specific pattern set with \fBverify_pattern\fR is verified. .TP .B null -Pretend to verify. Used for testing internals. +Only pretend to verify. Useful for testing internals with +`ioengine=null', not for much else. .RE - -This option can be used for repeated burn-in tests of a system to make sure -that the written data is also correctly read back. If the data direction given -is a read or random read, fio will assume that it should verify a previously -written file. If the data direction includes any form of write, the verify will -be of the newly written data. +.P +This option can be used for repeated burn\-in tests of a system to make sure +that the written data is also correctly read back. If the data direction +given is a read or random read, fio will assume that it should verify a +previously written file. If the data direction includes any form of write, +the verify will be of the newly written data. .RE .TP .BI verifysort \fR=\fPbool -If true, written verify blocks are sorted if \fBfio\fR deems it to be faster to -read them back in a sorted manner. Default: true. +If true, fio will sort written verify blocks when it deems it faster to read +them back in a sorted manner. This is often the case when overwriting an +existing file, since the blocks are already laid out in the file system. You +can ignore this option unless doing huge amounts of really fast I/O where +the red\-black tree sorting CPU time becomes significant. Default: true. .TP .BI verifysort_nr \fR=\fPint -Pre-load and sort verify blocks for a read workload. +Pre\-load and sort verify blocks for a read workload. 
.TP .BI verify_offset \fR=\fPint Swap the verification header with data somewhere else in the block before -writing. It is swapped back before verifying. +writing. It is swapped back before verifying. .TP .BI verify_interval \fR=\fPint -Write the verification header for this number of bytes, which should divide -\fBblocksize\fR. Default: \fBblocksize\fR. +Write the verification header at a finer granularity than the +\fBblocksize\fR. It will be written for chunks the size of +\fBverify_interval\fR. \fBblocksize\fR should divide this evenly. .TP .BI verify_pattern \fR=\fPstr -If set, fio will fill the io buffers with this pattern. Fio defaults to filling -with totally random bytes, but sometimes it's interesting to fill with a known -pattern for io verification purposes. Depending on the width of the pattern, -fio will fill 1/2/3/4 bytes of the buffer at the time(it can be either a -decimal or a hex number). The verify_pattern if larger than a 32-bit quantity -has to be a hex number that starts with either "0x" or "0X". Use with -\fBverify\fP=str. Also, verify_pattern supports %o format, which means that for -each block offset will be written and then verified back, e.g.: +If set, fio will fill the I/O buffers with this pattern. Fio defaults to +filling with totally random bytes, but sometimes it's interesting to fill +with a known pattern for I/O verification purposes. Depending on the width +of the pattern, fio will fill 1/2/3/4 bytes of the buffer at the time (it can +be either a decimal or a hex number). The \fBverify_pattern\fR if larger than +a 32\-bit quantity has to be a hex number that starts with either "0x" or +"0X". Use with \fBverify\fR. 
Also, \fBverify_pattern\fR supports %o +format, which means that for each block offset will be written and then +verified back, e.g.: .RS .RS -\fBverify_pattern\fR=%o +.P +verify_pattern=%o .RE +.P Or use combination of everything: -.LP .RS -\fBverify_pattern\fR=0xff%o"abcd"-21 +.P +verify_pattern=0xff%o"abcd"\-12 .RE .RE .TP .BI verify_fatal \fR=\fPbool -If true, exit the job on the first observed verification failure. Default: -false. +Normally fio will keep checking the entire contents before quitting on a +block verification failure. If this option is set, fio will exit the job on +the first observed failure. Default: false. .TP .BI verify_dump \fR=\fPbool -If set, dump the contents of both the original data block and the data block we -read off disk to files. This allows later analysis to inspect just what kind of -data corruption occurred. Off by default. +If set, dump the contents of both the original data block and the data block +we read off disk to files. This allows later analysis to inspect just what +kind of data corruption occurred. Off by default. .TP .BI verify_async \fR=\fPint -Fio will normally verify IO inline from the submitting thread. This option -takes an integer describing how many async offload threads to create for IO -verification instead, causing fio to offload the duty of verifying IO contents -to one or more separate threads. If using this offload option, even sync IO -engines can benefit from using an \fBiodepth\fR setting higher than 1, as it -allows them to have IO in flight while verifies are running. +Fio will normally verify I/O inline from the submitting thread. This option +takes an integer describing how many async offload threads to create for I/O +verification instead, causing fio to offload the duty of verifying I/O +contents to one or more separate threads. 
If using this offload option, even +sync I/O engines can benefit from using an \fBiodepth\fR setting higher +than 1, as it allows them to have I/O in flight while verifies are running. +Defaults to 0 async threads, i.e. verification is not asynchronous. .TP .BI verify_async_cpus \fR=\fPstr -Tell fio to set the given CPU affinity on the async IO verification threads. -See \fBcpus_allowed\fP for the format used. +Tell fio to set the given CPU affinity on the async I/O verification +threads. See \fBcpus_allowed\fR for the format used. .TP .BI verify_backlog \fR=\fPint Fio will normally verify the written contents of a job that utilizes verify once that job has completed. In other words, everything is written then everything is read back and verified. You may want to verify continually -instead for a variety of reasons. Fio stores the meta data associated with an -IO block in memory, so for large verify workloads, quite a bit of memory would -be used up holding this meta data. If this option is enabled, fio will write -only N blocks before verifying these blocks. +instead for a variety of reasons. Fio stores the meta data associated with +an I/O block in memory, so for large verify workloads, quite a bit of memory +would be used up holding this meta data. If this option is enabled, fio will +write only N blocks before verifying these blocks. .TP .BI verify_backlog_batch \fR=\fPint -Control how many blocks fio will verify if verify_backlog is set. If not set, -will default to the value of \fBverify_backlog\fR (meaning the entire queue is -read back and verified). If \fBverify_backlog_batch\fR is less than -\fBverify_backlog\fR then not all blocks will be verified, if -\fBverify_backlog_batch\fR is larger than \fBverify_backlog\fR, some blocks -will be verified more than once. +Control how many blocks fio will verify if \fBverify_backlog\fR is +set. If not set, will default to the value of \fBverify_backlog\fR +(meaning the entire queue is read back and verified). 
If +\fBverify_backlog_batch\fR is less than \fBverify_backlog\fR then not all +blocks will be verified, if \fBverify_backlog_batch\fR is larger than +\fBverify_backlog\fR, some blocks will be verified more than once. +.TP +.BI verify_state_save \fR=\fPbool +When a job exits during the write phase of a verify workload, save its +current state. This allows fio to replay up until that point, if the verify +state is loaded for the verify read phase. The format of the filename is, +roughly: +.RS +.RS +.P +<type>\-<jobname>\-<jobindex>\-verify.state. +.RE +.P +<type> is "local" for a local run, "sock" for a client/server socket +connection, and "ip" (192.168.0.1, for instance) for a networked +client/server connection. Defaults to true. +.RE +.TP +.BI verify_state_load \fR=\fPbool +If a verify termination trigger was used, fio stores the current write state +of each thread. This can be used at verification time so that fio knows how +far it should verify. Without this information, fio will run a full +verification pass, according to the settings in the job file used. Default: +false. .TP .BI trim_percentage \fR=\fPint Number of verify blocks to discard/trim. .TP .BI trim_verify_zero \fR=\fPbool -Verify that trim/discarded blocks are returned as zeroes. +Verify that trim/discarded blocks are returned as zeros. .TP .BI trim_backlog \fR=\fPint -Trim after this number of blocks are written. +Trim after this number of blocks are written. .TP .BI trim_backlog_batch \fR=\fPint -Trim this number of IO blocks. +Trim this number of I/O blocks. .TP .BI experimental_verify \fR=\fPbool Enable experimental verification. +.SS "Steady state" .TP -.BI verify_state_save \fR=\fPbool -When a job exits during the write phase of a verify workload, save its -current state. This allows fio to replay up until that point, if the -verify state is loaded for the verify read phase. -.TP -.BI verify_state_load \fR=\fPbool -If a verify termination trigger was used, fio stores the current write -state of each thread.
This can be used at verification time so that fio -knows how far it should verify. Without this information, fio will run -a full verification pass, according to the settings in the job file used. -.TP -.B stonewall "\fR,\fP wait_for_previous" -Wait for preceding jobs in the job file to exit before starting this one. -\fBstonewall\fR implies \fBnew_group\fR. -.TP -.B new_group -Start a new reporting group. If not given, all jobs in a file will be part -of the same reporting group, unless separated by a stonewall. -.TP -.BI numjobs \fR=\fPint -Number of clones (processes/threads performing the same workload) of this job. -Default: 1. -.TP -.B group_reporting -If set, display per-group reports instead of per-job when \fBnumjobs\fR is -specified. -.TP -.B thread -Use threads created with \fBpthread_create\fR\|(3) instead of processes created -with \fBfork\fR\|(2). -.TP -.BI zonesize \fR=\fPint -Divide file into zones of the specified size in bytes. See \fBzoneskip\fR. -.TP -.BI zonerange \fR=\fPint -Give size of an IO zone. See \fBzoneskip\fR. -.TP -.BI zoneskip \fR=\fPint -Skip the specified number of bytes when \fBzonesize\fR bytes of data have been -read. +.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float +Define the criterion and limit for assessing steady state performance. The +first parameter designates the criterion whereas the second parameter sets +the threshold. When the criterion falls below the threshold for the +specified duration, the job will stop. For example, `iops_slope:0.1%' will +direct fio to terminate the job when the least squares regression slope +falls below 0.1% of the mean IOPS. If \fBgroup_reporting\fR is enabled +this will apply to all jobs in the group. Below is the list of available +steady state assessment criteria. All assessments are carried out using only +data from the rolling collection window. Threshold limits can be expressed +as a fixed value or as a percentage of the mean in the collection window. 
+.RS +.RS .TP -.BI write_iolog \fR=\fPstr -Write the issued I/O patterns to the specified file. Specify a separate file -for each job, otherwise the iologs will be interspersed and the file may be -corrupt. +.B iops +Collect IOPS data. Stop the job if all individual IOPS measurements +are within the specified limit of the mean IOPS (e.g., `iops:2' +means that all individual IOPS values must be within 2 of the mean, +whereas `iops:0.2%' means that all individual IOPS values must be +within 0.2% of the mean IOPS to terminate the job). .TP -.BI read_iolog \fR=\fPstr -Replay the I/O patterns contained in the specified file generated by -\fBwrite_iolog\fR, or may be a \fBblktrace\fR binary file. +.B iops_slope +Collect IOPS data and calculate the least squares regression +slope. Stop the job if the slope falls below the specified limit. .TP -.BI replay_no_stall \fR=\fPint -While replaying I/O patterns using \fBread_iolog\fR the default behavior -attempts to respect timing information between I/Os. Enabling -\fBreplay_no_stall\fR causes I/Os to be replayed as fast as possible while -still respecting ordering. +.B bw +Collect bandwidth data. Stop the job if all individual bandwidth +measurements are within the specified limit of the mean bandwidth. .TP -.BI replay_redirect \fR=\fPstr -While replaying I/O patterns using \fBread_iolog\fR the default behavior -is to replay the IOPS onto the major/minor device that each IOP was recorded -from. Setting \fBreplay_redirect\fR causes all IOPS to be replayed onto the -single specified device regardless of the device it was recorded from. +.B bw_slope +Collect bandwidth data and calculate the least squares regression +slope. Stop the job if the slope falls below the specified limit. +.RE +.RE .TP -.BI replay_align \fR=\fPint -Force alignment of IO offsets and lengths in a trace to this power of 2 value. 
+.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime +A rolling window of this duration will be used to judge whether steady state +has been reached. Data will be collected once per second. The default is 0 +which disables steady state detection. When the unit is omitted, the +value is interpreted in seconds. .TP -.BI replay_scale \fR=\fPint -Scale sector offsets down by this factor when replaying traces. +.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime +Allow the job to run for the specified duration before beginning data +collection for checking the steady state job termination criterion. The +default is 0. When the unit is omitted, the value is interpreted in seconds. +.SS "Measurements and reporting" .TP .BI per_job_logs \fR=\fPbool If set, this generates bw/clat/iops log with per file private filenames. If -not set, jobs with identical names will share the log filename. Default: true. +not set, jobs with identical names will share the log filename. Default: +true. +.TP +.BI group_reporting +It may sometimes be interesting to display statistics for groups of jobs as +a whole instead of for each individual job. This is especially true if +\fBnumjobs\fR is used; looking at individual thread/process output +quickly becomes unwieldy. To see the final report per\-group instead of +per\-job, use \fBgroup_reporting\fR. Jobs in a file will be part of the +same reporting group, unless separated by a \fBstonewall\fR or by +using \fBnew_group\fR. +.TP +.BI new_group +Start a new reporting group. See: \fBgroup_reporting\fR. If not given, +all jobs in a file will be part of the same reporting group, unless +separated by a \fBstonewall\fR. +.TP +.BI stats \fR=\fPbool +By default, fio collects and shows final output results for all jobs +that run. If this option is set to 0, then fio will ignore this job in +the final stat output. .TP .BI write_bw_log \fR=\fPstr -If given, write a bandwidth log for this job.
Can be used to store data of the -bandwidth of the jobs in their lifetime. The included fio_generate_plots script -uses gnuplot to turn these text files into nice graphs. See \fBwrite_lat_log\fR -for behaviour of given filename. For this option, the postfix is _bw.x.log, -where x is the index of the job (1..N, where N is the number of jobs). If -\fBper_job_logs\fR is false, then the filename will not include the job index. -See the \fBLOG FILE FORMATS\fR -section. +If given, write a bandwidth log for this job. Can be used to store data of +the bandwidth of the jobs in their lifetime. The included +\fBfio_generate_plots\fR script uses gnuplot to turn these +text files into nice graphs. See \fBwrite_lat_log\fR for behavior of +given filename. For this option, the postfix is `_bw.x.log', where `x' +is the index of the job (1..N, where N is the number of jobs). If +\fBper_job_logs\fR is false, then the filename will not include the job +index. See \fBLOG FILE FORMATS\fR section. .TP .BI write_lat_log \fR=\fPstr -Same as \fBwrite_bw_log\fR, but writes I/O completion latencies. If no -filename is given with this option, the default filename of -"jobname_type.x.log" is used, where x is the index of the job (1..N, where -N is the number of jobs). Even if the filename is given, fio will still -append the type of log. If \fBper_job_logs\fR is false, then the filename will -not include the job index. See the \fBLOG FILE FORMATS\fR section. +Same as \fBwrite_bw_log\fR, except that this option stores I/O +submission, completion, and total latencies instead. If no filename is given +with this option, the default filename of `jobname_type.log' is +used. Even if the filename is given, fio will still append the type of +log. So if one specifies: +.RS +.RS +.P +write_lat_log=foo +.RE +.P +The actual log names will be `foo_slat.x.log', `foo_clat.x.log', +and `foo_lat.x.log', where `x' is the index of the job (1..N, where N +is the number of jobs). 
This helps \fBfio_generate_plots\fR find the +logs automatically. If \fBper_job_logs\fR is false, then the filename +will not include the job index. See \fBLOG FILE FORMATS\fR section. +.RE .TP .BI write_hist_log \fR=\fPstr -Same as \fBwrite_lat_log\fR, but writes I/O completion latency histograms. If -no filename is given with this option, the default filename of -"jobname_clat_hist.x.log" is used, where x is the index of the job (1..N, where -N is the number of jobs). Even if the filename is given, fio will still append -the type of log. If \fBper_job_logs\fR is false, then the filename will not -include the job index. See the \fBLOG FILE FORMATS\fR section. +Same as \fBwrite_lat_log\fR, but writes I/O completion latency +histograms. If no filename is given with this option, the default filename +of `jobname_clat_hist.x.log' is used, where `x' is the index of the +job (1..N, where N is the number of jobs). Even if the filename is given, +fio will still append the type of log. If \fBper_job_logs\fR is false, +then the filename will not include the job index. See \fBLOG FILE FORMATS\fR section. .TP .BI write_iops_log \fR=\fPstr -Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given with this -option, the default filename of "jobname_type.x.log" is used, where x is the -index of the job (1..N, where N is the number of jobs). Even if the filename -is given, fio will still append the type of log. If \fBper_job_logs\fR is false, -then the filename will not include the job index. See the \fBLOG FILE FORMATS\fR -section. +Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given +with this option, the default filename of `jobname_type.x.log' is +used, where `x' is the index of the job (1..N, where N is the number of +jobs). Even if the filename is given, fio will still append the type of +log. If \fBper_job_logs\fR is false, then the filename will not include +the job index. See \fBLOG FILE FORMATS\fR section. 
.TP .BI log_avg_msec \fR=\fPint By default, fio will log an entry in the iops, latency, or bw log for every -IO that completes. When writing to the disk log, that can quickly grow to a +I/O that completes. When writing to the disk log, that can quickly grow to a very large size. Setting this option makes fio average the each log entry over the specified period of time, reducing the resolution of the log. See -\fBlog_max_value\fR as well. Defaults to 0, logging all entries. -.TP -.BI log_max_value \fR=\fPbool -If \fBlog_avg_msec\fR is set, fio logs the average over that window. If you -instead want to log the maximum value, set this option to 1. Defaults to -0, meaning that averaged values are logged. +\fBlog_max_value\fR as well. Defaults to 0, logging all entries. +Also see \fBLOG FILE FORMATS\fR section. .TP .BI log_hist_msec \fR=\fPint -Same as \fBlog_avg_msec\fR, but logs entries for completion latency histograms. -Computing latency percentiles from averages of intervals using \fBlog_avg_msec\fR -is innacurate. Setting this option makes fio log histogram entries over the -specified period of time, reducing log sizes for high IOPS devices while -retaining percentile accuracy. See \fBlog_hist_coarseness\fR as well. Defaults -to 0, meaning histogram logging is disabled. +Same as \fBlog_avg_msec\fR, but logs entries for completion latency +histograms. Computing latency percentiles from averages of intervals using +\fBlog_avg_msec\fR is inaccurate. Setting this option makes fio log +histogram entries over the specified period of time, reducing log sizes for +high IOPS devices while retaining percentile accuracy. See +\fBlog_hist_coarseness\fR as well. Defaults to 0, meaning histogram +logging is disabled. .TP .BI log_hist_coarseness \fR=\fPint -Integer ranging from 0 to 6, defining the coarseness of the resolution of the -histogram logs enabled with \fBlog_hist_msec\fR. For each increment in -coarseness, fio outputs half as many bins. 
Defaults to 0, for which histogram -logs contain 1216 latency bins. See the \fBLOG FILE FORMATS\fR section. +Integer ranging from 0 to 6, defining the coarseness of the resolution of +the histogram logs enabled with \fBlog_hist_msec\fR. For each increment +in coarseness, fio outputs half as many bins. Defaults to 0, for which +histogram logs contain 1216 latency bins. See \fBLOG FILE FORMATS\fR section. +.TP +.BI log_max_value \fR=\fPbool +If \fBlog_avg_msec\fR is set, fio logs the average over that window. If +you instead want to log the maximum value, set this option to 1. Defaults to +0, meaning that averaged values are logged. .TP .BI log_offset \fR=\fPbool -If this is set, the iolog options will include the byte offset for the IO -entry as well as the other data values. +If this is set, the iolog options will include the byte offset for the I/O +entry as well as the other data values. Defaults to 0 meaning that +offsets are not present in logs. Also see \fBLOG FILE FORMATS\fR section. .TP .BI log_compression \fR=\fPint -If this is set, fio will compress the IO logs as it goes, to keep the memory -footprint lower. When a log reaches the specified size, that chunk is removed -and compressed in the background. Given that IO logs are fairly highly -compressible, this yields a nice memory savings for longer runs. The downside -is that the compression will consume some background CPU cycles, so it may -impact the run. This, however, is also true if the logging ends up consuming -most of the system memory. So pick your poison. The IO logs are saved -normally at the end of a run, by decompressing the chunks and storing them -in the specified log file. This feature depends on the availability of zlib. +If this is set, fio will compress the I/O logs as it goes, to keep the +memory footprint lower. When a log reaches the specified size, that chunk is +removed and compressed in the background. 
Given that I/O logs are fairly +highly compressible, this yields a nice memory savings for longer runs. The +downside is that the compression will consume some background CPU cycles, so +it may impact the run. This, however, is also true if the logging ends up +consuming most of the system memory. So pick your poison. The I/O logs are +saved normally at the end of a run, by decompressing the chunks and storing +them in the specified log file. This feature depends on the availability of +zlib. .TP .BI log_compression_cpus \fR=\fPstr -Define the set of CPUs that are allowed to handle online log compression -for the IO jobs. This can provide better isolation between performance +Define the set of CPUs that are allowed to handle online log compression for +the I/O jobs. This can provide better isolation between performance sensitive jobs, and background compression work. .TP .BI log_store_compressed \fR=\fPbool If set, fio will store the log files in a compressed format. They can be -decompressed with fio, using the \fB\-\-inflate-log\fR command line parameter. -The files will be stored with a \fB\.fz\fR suffix. +decompressed with fio, using the \fB\-\-inflate\-log\fR command line +parameter. The files will be stored with a `.fz' suffix. .TP .BI log_unix_epoch \fR=\fPbool If set, fio will log Unix timestamps to the log files produced by enabling -\fBwrite_type_log\fR for each log type, instead of the default zero-based +write_type_log for each log type, instead of the default zero\-based timestamps. .TP .BI block_error_percentiles \fR=\fPbool -If set, record errors in trim block-sized units from writes and trims and output -a histogram of how many trims it took to get to errors, and what kind of error -was encountered. +If set, record errors in trim block\-sized units from writes and trims and +output a histogram of how many trims it took to get to errors, and what kind +of error was encountered. 
+.TP +.BI bwavgtime \fR=\fPint +Average the calculated bandwidth over the given time. Value is specified in +milliseconds. If the job also does bandwidth logging through +\fBwrite_bw_log\fR, then the minimum of this option and +\fBlog_avg_msec\fR will be used. Default: 500ms. +.TP +.BI iopsavgtime \fR=\fPint +Average the calculated IOPS over the given time. Value is specified in +milliseconds. If the job also does IOPS logging through +\fBwrite_iops_log\fR, then the minimum of this option and +\fBlog_avg_msec\fR will be used. Default: 500ms. +.TP +.BI disk_util \fR=\fPbool +Generate disk utilization statistics, if the platform supports it. +Default: true. .TP .BI disable_lat \fR=\fPbool -Disable measurements of total latency numbers. Useful only for cutting -back the number of calls to \fBgettimeofday\fR\|(2), as that does impact performance at -really high IOPS rates. Note that to really get rid of a large amount of these -calls, this option must be used with disable_slat and disable_bw as well. +Disable measurements of total latency numbers. Useful only for cutting back +the number of calls to \fBgettimeofday\fR\|(2), as that does impact +performance at really high IOPS rates. Note that to really get rid of a +large amount of these calls, this option must be used with +\fBdisable_slat\fR and \fBdisable_bw_measurement\fR as well. .TP .BI disable_clat \fR=\fPbool -Disable measurements of completion latency numbers. See \fBdisable_lat\fR. +Disable measurements of completion latency numbers. See +\fBdisable_lat\fR. .TP .BI disable_slat \fR=\fPbool -Disable measurements of submission latency numbers. See \fBdisable_lat\fR. +Disable measurements of submission latency numbers. See +\fBdisable_lat\fR. .TP -.BI disable_bw_measurement \fR=\fPbool -Disable measurements of throughput/bandwidth numbers. See \fBdisable_lat\fR. +.BI disable_bw_measurement \fR=\fPbool "\fR,\fP disable_bw" \fR=\fPbool +Disable measurements of throughput/bandwidth numbers. See +\fBdisable_lat\fR. 
.TP -.BI lockmem \fR=\fPint -Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to -simulate a smaller amount of memory. The amount specified is per worker. +.BI clat_percentiles \fR=\fPbool +Enable the reporting of percentiles of completion latencies. This option is +mutually exclusive with \fBlat_percentiles\fR. .TP -.BI exec_prerun \fR=\fPstr -Before running the job, execute the specified command with \fBsystem\fR\|(3). -.RS -Output is redirected in a file called \fBjobname.prerun.txt\fR -.RE +.BI lat_percentiles \fR=\fPbool +Enable the reporting of percentiles of I/O latencies. This is similar to +\fBclat_percentiles\fR, except that this includes the submission latency. +This option is mutually exclusive with \fBclat_percentiles\fR. .TP -.BI exec_postrun \fR=\fPstr -Same as \fBexec_prerun\fR, but the command is executed after the job completes. +.BI percentile_list \fR=\fPfloat_list +Overwrite the default list of percentiles for completion latencies and the +block error histogram. Each number is a floating\-point number in the range +(0,100], and the maximum length of the list is 20. Use ':' to separate the +numbers, and list the numbers in ascending order. For example, +`\-\-percentile_list=99.5:99.9' will cause fio to report the values of +completion latency below which 99.5% and 99.9% of the observed latencies +fell, respectively. +.SS "Error handling" +.TP +.BI exitall_on_error +When one job finishes in error, terminate the rest. The default is to wait +for each job to finish. +.TP +.BI continue_on_error \fR=\fPstr +Normally fio will exit the job on the first observed failure. If this option +is set, fio will continue the job when there is a 'non\-fatal error' (EIO or +EILSEQ) until the runtime is exceeded or the I/O size specified is +completed. If this option is used, there are two more stats that are +appended, the total error count and the first error. The error field given +in the stats is the first error that was hit during the run.
+The allowed values are: +.RS .RS -Output is redirected in a file called \fBjobname.postrun.txt\fR -.RE -.TP -.BI ioscheduler \fR=\fPstr -Attempt to switch the device hosting the file to the specified I/O scheduler. .TP -.BI disk_util \fR=\fPbool -Generate disk utilization statistics if the platform supports it. Default: true. +.B none +Exit on any I/O or verify errors. .TP -.BI clocksource \fR=\fPstr -Use the given clocksource as the base of timing. The supported options are: -.RS +.B read +Continue on read errors, exit on all others. .TP -.B gettimeofday -\fBgettimeofday\fR\|(2) +.B write +Continue on write errors, exit on all others. .TP -.B clock_gettime -\fBclock_gettime\fR\|(2) +.B io +Continue on any I/O error, exit on all others. .TP -.B cpu -Internal CPU clock source +.B verify +Continue on verify errors, exit on all others. .TP -.RE -.P -\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast -(and fio is heavy on time calls). Fio will automatically use this clocksource -if it's supported and considered reliable on the system it is running on, -unless another clocksource is specifically set. For x86/x86-64 CPUs, this -means supporting TSC Invariant. +.B all +Continue on all errors. .TP -.BI gtod_reduce \fR=\fPbool -Enable all of the \fBgettimeofday\fR\|(2) reducing options (disable_clat, disable_slat, -disable_bw) plus reduce precision of the timeout somewhat to really shrink the -\fBgettimeofday\fR\|(2) call count. With this option enabled, we only do about 0.4% of -the gtod() calls we would have done if all time keeping was enabled. +.B 0 +Backward\-compatible alias for 'none'. .TP -.BI gtod_cpu \fR=\fPint -Sometimes it's cheaper to dedicate a single thread of execution to just getting -the current time. Fio (and databases, for instance) are very intensive on -\fBgettimeofday\fR\|(2) calls. With this option, you can set one CPU aside for doing -nothing but logging current time to a shared memory location. 
Then the other -threads/processes that run IO workloads need only copy that segment, instead of -entering the kernel with a \fBgettimeofday\fR\|(2) call. The CPU set aside for doing -these time calls will be excluded from other uses. Fio will manually clear it -from the CPU mask of other jobs. +.B 1 +Backward\-compatible alias for 'all'. +.RE +.RE .TP .BI ignore_error \fR=\fPstr -Sometimes you want to ignore some errors during test in that case you can specify -error list for each error type. -.br -ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST -.br -errors for given error type is separated with ':'. -Error may be symbol ('ENOSPC', 'ENOMEM') or an integer. -.br -Example: ignore_error=EAGAIN,ENOSPC:122 . -.br -This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from WRITE. +Sometimes you want to ignore some errors during a test. In that case you can +specify an error list for each error type, instead of only being able to +ignore the default 'non\-fatal error' using \fBcontinue_on_error\fR. +The format is `ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST'; errors for a +given error type are separated with ':'. An error may be a symbol ('ENOSPC', 'ENOMEM') +or an integer. Example: +.RS +.RS +.P +ignore_error=EAGAIN,ENOSPC:122 +.RE +.P +This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from +WRITE. This option works by overriding \fBcontinue_on_error\fR with +the list of errors for each error type if any. +.RE .TP .BI error_dump \fR=\fPbool -If set dump every error even if it is non fatal, true by default. If disabled -only fatal error will be dumped +If set, dump every error even if it is non\-fatal; true by default. If +disabled, only fatal errors will be dumped. +.SS "Running predefined workloads" +Fio includes predefined profiles that mimic the I/O workloads generated by +other tools. .TP .BI profile \fR=\fPstr -Select a specific builtin performance test. -.TP -.BI cgroup \fR=\fPstr -Add job to this control group.
If it doesn't exist, it will be created. -The system must have a mounted cgroup blkio mount point for this to work. If -your system doesn't have it mounted, you can do so with: - -# mount \-t cgroup \-o blkio none /cgroup -.TP -.BI cgroup_weight \fR=\fPint -Set the weight of the cgroup to this value. See the documentation that comes -with the kernel, allowed values are in the range of 100..1000. -.TP -.BI cgroup_nodelete \fR=\fPbool -Normally fio will delete the cgroups it has created after the job completion. -To override this behavior and to leave cgroups around after the job completion, -set cgroup_nodelete=1. This can be useful if one wants to inspect various -cgroup files after job completion. Default: false -.TP -.BI uid \fR=\fPint -Instead of running as the invoking user, set the user ID to this value before -the thread/process does any work. -.TP -.BI gid \fR=\fPint -Set group ID, see \fBuid\fR. -.TP -.BI unit_base \fR=\fPint -Base unit for reporting. Allowed values are: +The predefined workload to run. Current profiles are: +.RS .RS .TP -.B 0 -Use auto-detection (default). -.TP -.B 8 -Byte based. +.B tiobench +Threaded I/O bench (tiotest/tiobench) like workload. .TP -.B 1 -Bit based. +.B act +Aerospike Certification Tool (ACT) like workload. +.RE .RE .P -.TP -.BI flow_id \fR=\fPint -The ID of the flow. If not specified, it defaults to being a global flow. See -\fBflow\fR. -.TP -.BI flow \fR=\fPint -Weight in token-based flow control. If this value is used, then there is a -\fBflow counter\fR which is used to regulate the proportion of activity between -two or more jobs. fio attempts to keep this flow counter near zero. The -\fBflow\fR parameter stands for how much should be added or subtracted to the -flow counter on each iteration of the main I/O loop. That is, if one job has -\fBflow=8\fR and another job has \fBflow=-1\fR, then there will be a roughly -1:8 ratio in how much one runs vs the other. 
-.TP -.BI flow_watermark \fR=\fPint -The maximum value that the absolute value of the flow counter is allowed to -reach before the job must wait for a lower value of the counter. -.TP -.BI flow_sleep \fR=\fPint -The period of time, in microseconds, to wait after the flow watermark has been -exceeded before retrying operations -.TP -.BI clat_percentiles \fR=\fPbool -Enable the reporting of percentiles of completion latencies. -.TP -.BI percentile_list \fR=\fPfloat_list -Overwrite the default list of percentiles for completion latencies and the -block error histogram. Each number is a floating number in the range (0,100], -and the maximum length of the list is 20. Use ':' to separate the -numbers. For example, \-\-percentile_list=99.5:99.9 will cause fio to -report the values of completion latency below which 99.5% and 99.9% of -the observed latencies fell, respectively. -.SS "Ioengine Parameters List" -Some parameters are only valid when a specific ioengine is in use. These are -used identically to normal parameters, with the caveat that when used on the -command line, they must come after the ioengine. -.TP -.BI (cpuio)cpuload \fR=\fPint -Attempt to use the specified percentage of CPU cycles. -.TP -.BI (cpuio)cpuchunks \fR=\fPint -Split the load into cycles of the given time. In microseconds. -.TP -.BI (cpuio)exit_on_io_done \fR=\fPbool -Detect when IO threads are done, then exit. -.TP -.BI (libaio)userspace_reap -Normally, with the libaio engine in use, fio will use -the io_getevents system call to reap newly returned events. -With this flag turned on, the AIO ring will be read directly -from user-space to reap events. The reaping mode is only -enabled when polling for a minimum of 0 events (eg when -iodepth_batch_complete=0). -.TP -.BI (pvsync2)hipri -Set RWF_HIPRI on IO, indicating to the kernel that it's of -higher priority than normal. -.TP -.BI (net,netsplice)hostname \fR=\fPstr -The host name or IP address to use for TCP or UDP based IO. 
-If the job is a TCP listener or UDP reader, the hostname is not -used and must be omitted unless it is a valid UDP multicast address. -.TP -.BI (net,netsplice)port \fR=\fPint -The TCP or UDP port to bind to or connect to. If this is used with -\fBnumjobs\fR to spawn multiple instances of the same job type, then -this will be the starting port number since fio will use a range of ports. -.TP -.BI (net,netsplice)interface \fR=\fPstr -The IP address of the network interface used to send or receive UDP multicast -packets. -.TP -.BI (net,netsplice)ttl \fR=\fPint -Time-to-live value for outgoing UDP multicast packets. Default: 1 -.TP -.BI (net,netsplice)nodelay \fR=\fPbool -Set TCP_NODELAY on TCP connections. -.TP -.BI (net,netsplice)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr -The network protocol to use. Accepted values are: -.RS +To view a profile's additional options use \fB\-\-cmdhelp\fR after specifying +the profile. For example: .RS .TP -.B tcp -Transmission control protocol -.TP -.B tcpv6 -Transmission control protocol V6 +$ fio \-\-profile=act \-\-cmdhelp +.RE +.SS "Act profile options" .TP -.B udp -User datagram protocol +.BI device\-names \fR=\fPstr +Devices to use. .TP -.B udpv6 -User datagram protocol V6 +.BI load \fR=\fPint +ACT load multiplier. Default: 1. .TP -.B unix -UNIX domain socket -.RE -.P -When the protocol is TCP or UDP, the port must also be given, -as well as the hostname if the job is a TCP listener or UDP -reader. For unix sockets, the normal filename option should be -used and the port is invalid. -.RE -.TP -.BI (net,netsplice)listen -For TCP network connections, tell fio to listen for incoming -connections rather than initiating an outgoing connection. The -hostname must be omitted if this option is used. -.TP -.BI (net, pingpong) \fR=\fPbool -Normally a network writer will just continue writing data, and a network reader -will just consume packets. 
If pingpong=1 is set, a writer will send its normal -payload to the reader, then wait for the reader to send the same payload back. -This allows fio to measure network latencies. The submission and completion -latencies then measure local time spent sending or receiving, and the -completion latency measures how long it took for the other end to receive and -send back. For UDP multicast traffic pingpong=1 should only be set for a single -reader when multiple readers are listening to the same address. +.BI test\-duration\fR=\fPtime +How long the entire test takes to run. When the unit is omitted, the value +is given in seconds. Default: 24h. .TP -.BI (net, window_size) \fR=\fPint -Set the desired socket buffer size for the connection. +.BI threads\-per\-queue\fR=\fPint +Number of read I/O threads per device. Default: 8. .TP -.BI (net, mss) \fR=\fPint -Set the TCP maximum segment size (TCP_MAXSEG). +.BI read\-req\-num\-512\-blocks\fR=\fPint +Number of 512B blocks to read at a time. Default: 3. .TP -.BI (e4defrag,donorname) \fR=\fPstr -File will be used as a block donor (swap extents between files) +.BI large\-block\-op\-kbytes\fR=\fPint +Size of large block ops in KiB (writes). Default: 131072. .TP -.BI (e4defrag,inplace) \fR=\fPint -Configure donor file block allocation strategy -.RS -.BI 0(default) : -Preallocate donor's file on init +.BI prep +Set to run ACT prep phase. +.SS "Tiobench profile options" .TP -.BI 1: -allocate space immediately inside defragment event, and free right after event -.RE -.TP -.BI (rbd)clustername \fR=\fPstr -Specifies the name of the ceph cluster. +.BI size\fR=\fPstr +Size in MiB. .TP -.BI (rbd)rbdname \fR=\fPstr -Specifies the name of the RBD. +.BI block\fR=\fPint +Block size in bytes. Default: 4096. .TP -.BI (rbd)pool \fR=\fPstr -Specifies the name of the Ceph pool containing the RBD. +.BI numruns\fR=\fPint +Number of runs. .TP -.BI (rbd)clientname \fR=\fPstr -Specifies the username (without the 'client.'
prefix) used to access the Ceph -cluster. If the clustername is specified, the clientname shall be the full -type.id string. If no type. prefix is given, fio will add 'client.' by default. +.BI dir\fR=\fPstr +Test directory. .TP -.BI (mtd)skipbad \fR=\fPbool -Skip operations against known bad blocks. +.BI threads\fR=\fPint +Number of threads. .SH OUTPUT -While running, \fBfio\fR will display the status of the created jobs. For -example: -.RS -.P -Threads: 1: [_r] [24.8% done] [ 13509/ 8334 kb/s] [eta 00h:01m:31s] -.RE +Fio spits out a lot of output. While running, fio will display the status of the +jobs created. An example of that would be: .P -The characters in the first set of brackets denote the current status of each -threads. The possible values are: -.P -.PD 0 +.nf + Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s] +.fi +.P +The characters inside the first set of square brackets denote the current status of +each thread. The first character is the first job defined in the job file, and so +forth. The possible values (in typical life cycle order) are: .RS .TP +.PD 0 .B P -Setup but not started. +Thread setup, but not started. .TP .B C Thread created. .TP .B I -Initialized, waiting. +Thread initialized, waiting or generating necessary data. +.TP +.B p +Thread running pre\-reading file(s). +.TP +.B / +Thread is in ramp period. .TP .B R Running, doing sequential reads. @@ -1949,563 +2734,759 @@ .B m Running, doing mixed random reads/writes. .TP +.B D +Running, doing sequential trims. +.TP +.B d +Running, doing random trims. +.TP .B F Running, currently waiting for \fBfsync\fR\|(2). .TP .B V -Running, verifying written data. +Running, doing verification of written data. +.TP +.B f +Thread finishing. .TP .B E -Exited, not reaped by main thread. +Thread exited, not reaped by main thread yet. .TP .B \- -Exited, thread reaped.
-.RE -.PD -.P -The second set of brackets shows the estimated completion percentage of -the current group. The third set shows the read and write I/O rate, -respectively. Finally, the estimated run time of the job is displayed. -.P -When \fBfio\fR completes (or is interrupted by Ctrl-C), it will show data -for each thread, each group of threads, and each disk, in that order. -.P -Per-thread statistics first show the threads client number, group-id, and -error code. The remaining figures are as follows: -.RS -.TP -.B io -Number of megabytes of I/O performed. +Thread reaped. .TP -.B bw -Average data rate (bandwidth). +.B X +Thread reaped, exited with an error. .TP -.B runt -Threads run time. +.B K +Thread reaped, exited due to signal. +.PD +.RE +.P +Fio will condense the thread string so as not to take up more space on the command +line than needed. For instance, if you have 10 readers and 10 writers running, +the output would look like this: +.P +.nf + Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s] +.fi +.P +Note that the status string is displayed in order, so it's possible to tell which of +the jobs are currently doing what. In the example above this means that jobs 1\-\-10 +are readers and 11\-\-20 are writers. +.P +The other values are fairly self\-explanatory \-\- number of threads currently +running and doing I/O, the number of currently open files (f=), the estimated +completion percentage, the rate of I/O since last check (read speed listed first, +then write speed and optionally trim speed) in terms of bandwidth and IOPS, +and time to completion for the current running group. It's impossible to estimate +runtime of the following groups (if any). +.P +When fio is done (or interrupted by Ctrl\-C), it will show the data for +each thread, group of threads, and disks in that order.
For each overall thread (or +group) the output looks like: +.P +.nf + Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017 + write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec) + slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50 + clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31 + lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79 + clat percentiles (usec): + | 1.00th=[ 302], 5.00th=[ 326], 10.00th=[ 343], 20.00th=[ 363], + | 30.00th=[ 392], 40.00th=[ 404], 50.00th=[ 416], 60.00th=[ 445], + | 70.00th=[ 816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627], + | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877], + | 99.99th=[78119] + bw ( KiB/s): min= 532, max= 686, per=0.10%, avg=622.87, stdev=24.82, samples= 100 + iops : min= 76, max= 98, avg=88.98, stdev= 3.54, samples= 100 + lat (usec) : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79% + lat (msec) : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37% + lat (msec) : 100=0.65% + cpu : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21 + IO depths : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0% + submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0 + latency : target=0, window=0, percentile=100.00%, depth=8 +.fi +.P +The job name (or first job's name when using \fBgroup_reporting\fR) is printed, +along with the group id, count of jobs being aggregated, last error id seen (which +is 0 when there are no errors), pid/tid of that thread and the time the job/group +completed. Below are the I/O statistics for each data direction performed (showing +writes in the example above). In the order listed, they denote: +.RS +.TP +.B read/write/trim +The string before the colon shows the I/O direction the statistics +are for. \fIIOPS\fR is the average I/Os performed per second. 
\fIBW\fR +is the average bandwidth rate shown as: value in power of 2 format +(value in power of 10 format). The last two values show: (total +I/O performed in power of 2 format / \fIruntime\fR of that thread). .TP .B slat -Submission latency minimum, maximum, average and standard deviation. This is -the time it took to submit the I/O. +Submission latency (\fImin\fR being the minimum, \fImax\fR being the +maximum, \fIavg\fR being the average, \fIstdev\fR being the standard +deviation). This is the time it took to submit the I/O. For +sync I/O this row is not displayed as the slat is really the +completion latency (since queue/complete is one operation there). +This value can be in nanoseconds, microseconds or milliseconds \-\-\- +fio will choose the most appropriate base and print that (in the +example above nanoseconds was the best scale). Note: in \fB\-\-minimal\fR mode +latencies are always expressed in microseconds. .TP .B clat -Completion latency minimum, maximum, average and standard deviation. This -is the time between submission and completion. +Completion latency. Same names as slat, this denotes the time from +submission to completion of the I/O pieces. For sync I/O, clat will +usually be equal (or very close) to 0, as the time from submit to +complete is basically just CPU time (I/O has already been done, see slat +explanation). +.TP +.B lat +Total latency. Same names as slat and clat, this denotes the time from +when fio created the I/O unit to completion of the I/O operation. .TP .B bw -Bandwidth minimum, maximum, percentage of aggregate bandwidth received, average -and standard deviation. +Bandwidth statistics based on samples. Same names as the xlat stats, +but also includes the number of samples taken (\fIsamples\fR) and an +approximate percentage of total aggregate bandwidth this thread +received in its group (\fIper\fR). 
This last value is only really +useful if the threads in this group are on the same disk, since they +are then competing for disk access. +.TP +.B iops +IOPS statistics based on samples. Same names as \fBbw\fR. +.TP +.B lat (nsec/usec/msec) +The distribution of I/O completion latencies. This is the time from when +I/O leaves fio and when it gets completed. Unlike the separate +read/write/trim sections above, the data here and in the remaining +sections apply to all I/Os for the reporting group. 250=0.04% means that +0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11% +of the I/Os required 250 to 499us for completion. .TP .B cpu -CPU usage statistics. Includes user and system time, number of context switches -this thread went through and number of major and minor page faults. The CPU -utilization numbers are averages for the jobs in that reporting group, while -the context and fault counters are summed. +CPU usage. User and system time, along with the number of context +switches this thread went through, usage of system and user time, and +finally the number of major and minor page faults. The CPU utilization +numbers are averages for the jobs in that reporting group, while the +context and fault counters are summed. .TP .B IO depths -Distribution of I/O depths. Each depth includes everything less than (or equal) -to it, but greater than the previous depth. -.TP -.B IO issued -Number of read/write requests issued, and number of short read/write requests. -.TP -.B IO latencies -Distribution of I/O completion latencies. The numbers follow the same pattern -as \fBIO depths\fR. -.RE +The distribution of I/O depths over the job lifetime. The numbers are +divided into powers of 2 and each entry covers depths from that value +up to those that are lower than the next entry \-\- e.g., 16= covers +depths from 16 to 31. 
Note that the range covered by a depth +distribution entry can be different to the range covered by the +equivalent \fBsubmit\fR/\fBcomplete\fR distribution entry. +.TP +.B IO submit +How many pieces of I/O were submitted in a single submit call. Each +entry denotes that amount and below, until the previous entry \-\- e.g., +16=100% means that we submitted anywhere from 9 to 16 I/Os per submit +call. Note that the range covered by a \fBsubmit\fR distribution entry can +be different to the range covered by the equivalent depth distribution +entry. +.TP +.B IO complete +Like the above \fBsubmit\fR number, but for completions instead. +.TP +.B IO issued rwt +The number of \fBread/write/trim\fR requests issued, and how many of them were +short or dropped. +.TP +.B IO latency +These values are for \fBlatency_target\fR and related options. When +these options are engaged, this section describes the I/O depth required +to meet the specified latency target. +.RE +.P +After each client has been listed, the group statistics are printed. They +will look like this: +.P +.nf + Run status group 0 (all jobs): + READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s\-10.8MiB/s (10.9MB/s\-11.3MB/s), io=64.0MiB (67.1MB), run=2973\-3069msec + WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s\-621KiB/s (630kB/s\-636kB/s), io=64.0MiB (67.1MB), run=52747\-53223msec +.fi .P -The group statistics show: -.PD 0 +For each data direction it prints: .RS .TP -.B io -Number of megabytes I/O performed. -.TP -.B aggrb -Aggregate bandwidth of threads in the group. -.TP -.B minb -Minimum average bandwidth a thread saw. -.TP -.B maxb -Maximum average bandwidth a thread saw. +.B bw +Aggregate bandwidth of threads in this group followed by the +minimum and maximum bandwidth of all the threads in this group. +Values outside of brackets are power\-of\-2 format and those +within are the equivalent value in a power\-of\-10 format. .TP -.B mint -Shortest runtime of threads in the group.
+.B io +Aggregate I/O performed by all threads in this group. The +format is the same as \fBbw\fR. .TP -.B maxt -Longest runtime of threads in the group. +.B run +The shortest and longest runtimes of the threads in this group. .RE -.PD .P -Finally, disk statistics are printed with reads first: -.PD 0 +And finally, the disk statistics are printed. This is Linux specific. +They will look like this: +.P +.nf + Disk stats (read/write): + sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% +.fi +.P +Each value is printed for both reads and writes, with reads first. The +numbers denote: .RS .TP .B ios Number of I/Os performed by all groups. .TP .B merge -Number of merges in the I/O scheduler. +Number of merges performed by the I/O scheduler. .TP .B ticks Number of ticks we kept the disk busy. .TP -.B io_queue +.B in_queue Total time spent in the disk queue. .TP .B util -Disk utilization. +The disk utilization. A value of 100% means we kept the disk +busy constantly, 50% would be a disk idling half of the time. .RE -.PD .P -It is also possible to get fio to dump the current output while it is -running, without terminating the job. To do that, send fio the \fBUSR1\fR -signal. +It is also possible to get fio to dump the current output while it is running, +without terminating the job. To do that, send fio the USR1 signal. You can +also get regularly timed dumps by using the \fB\-\-status\-interval\fR +parameter, or by creating a file in `/tmp' named +`fio\-dump\-status'. If fio sees this file, it will unlink it and dump the +current output status.
If the output has to be changed -for some reason, this number will be incremented by 1 to signify that -change. The fields are: +For scripted usage where you typically want to generate tables or graphs of the +results, fio can output the results in a semicolon separated format. The format +is one long line of values, such as: .P -.RS -.B terse version, fio version, jobname, groupid, error +.nf + 2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00% + A description of this job goes here. +.fi .P -Read status: -.RS -.B Total I/O \fR(KB)\fP, bandwidth \fR(KB/s)\fP, IOPS, runtime \fR(ms)\fP +The job description (if provided) follows on a second line. .P -Submission latency: -.RS -.B min, max, mean, standard deviation -.RE -Completion latency: -.RS -.B min, max, mean, standard deviation -.RE -Completion latency percentiles (20 fields): -.RS -.B Xth percentile=usec -.RE -Total latency: -.RS -.B min, max, mean, standard deviation -.RE -Bandwidth: -.RS -.B min, max, aggregate percentage of total, mean, standard deviation -.RE -.RE +To enable terse output, use the \fB\-\-minimal\fR or +`\-\-output\-format=terse' command line options. The +first value is the version of the terse output format. If the output has to be +changed for some reason, this number will be incremented by 1 to signify that +change. 
.P -Write status: -.RS -.B Total I/O \fR(KB)\fP, bandwidth \fR(KB/s)\fP, IOPS, runtime \fR(ms)\fP +Split up, the format is as follows (comments in brackets denote when a +field was introduced or whether it's specific to some terse version): .P -Submission latency: +.nf + terse version, fio version [v3], jobname, groupid, error +.fi .RS -.B min, max, mean, standard deviation +.P +.B +READ status: .RE -Completion latency: +.P +.nf + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples +.fi .RS -.B min, max, mean, standard deviation +.P +.B +WRITE status: .RE -Completion latency percentiles (20 fields): +.P +.nf + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples +.fi .RS -.B Xth percentile=usec +.P +.B +TRIM status [all but version 3]: .RE -Total latency: +.P +.nf + Fields are similar to \fBREAD/WRITE\fR status. 
+.fi .RS -.B min, max, mean, standard deviation +.P +.B +CPU usage: .RE -Bandwidth: +.P +.nf + user, system, context switches, major faults, minor faults +.fi .RS -.B min, max, aggregate percentage of total, mean, standard deviation -.RE +.P +.B +I/O depths: .RE .P -CPU usage: +.nf + <=1, 2, 4, 8, 16, 32, >=64 +.fi .RS -.B user, system, context switches, major page faults, minor page faults +.P +.B +I/O latencies microseconds: .RE .P -IO depth distribution: +.nf + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 +.fi .RS -.B <=1, 2, 4, 8, 16, 32, >=64 +.P +.B +I/O latencies milliseconds: .RE .P -IO latency distribution: -.RS -Microseconds: +.nf + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 +.fi .RS -.B <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 +.P +.B +Disk utilization [v3]: .RE -Milliseconds: +.P +.nf + disk name, read ios, write ios, read merges, write merges, read ticks, write ticks, time spent in queue, disk utilization percentage +.fi .RS -.B <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 -.RE +.P +.B +Additional Info (dependent on continue_on_error, default off): .RE .P -Disk utilization (1 for each disk used): +.nf + total # errors, first error code +.fi .RS -.B name, read ios, write ios, read merges, write merges, read ticks, write ticks, read in-queue time, write in-queue time, disk utilization percentage +.P +.B +Additional Info (dependent on description being set): .RE .P -Error Info (dependent on continue_on_error, default off): +.nf + Text description +.fi +.P +Completion latency percentiles can be a grouping of up to 20 sets, so for the +terse output fio writes all of them. Each field will look like this: +.P +.nf + 1.00%=6112 +.fi +.P +which is the Xth percentile, and the `usec' latency associated with it. +.P +For \fBDisk utilization\fR, all disks used by fio are shown. So for each disk there +will be a disk utilization section. 
+.P +Below is a single line containing short names for each of the fields in the +minimal output v3, separated by semicolons: +.P +.nf + terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util +.fi +.SH JSON OUTPUT +The \fBjson\fR output format is intended to be both human readable and convenient 
+for automated parsing. For the most part its sections mirror those of the +\fBnormal\fR output. The \fBruntime\fR value is reported in msec and the \fBbw\fR value is +reported in 1024 bytes per second units. +.fi +.SH JSON+ OUTPUT +The \fBjson+\fR output format is identical to the \fBjson\fR output format except that it +adds a full dump of the completion latency bins. Each \fBbins\fR object contains a +set of (key, value) pairs where keys are latency durations and values count how +many I/Os had completion latencies of the corresponding duration. For example, +consider: .RS -.B total # errors, first error code -.RE .P +"bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... } .RE +.P +This data indicates that one I/O required 87,552ns to complete, two I/Os required +100,864ns to complete, and 7529 I/Os required 107,008ns to complete. +.P +Also included with fio is a Python script \fBfio_jsonplus_clat2csv\fR that takes +json+ output and generates CSV\-formatted latency data suitable for plotting. +.P +The latency durations actually represent the midpoints of latency intervals. +For details refer to `stat.h' in the fio source. .SH TRACE FILE FORMAT -There are two trace file format that you can encounter. The older (v1) format -is unsupported since version 1.20-rc3 (March 2008). It will still be described +There are two trace file formats that you can encounter. The older (v1) format has been +unsupported since version 1.20\-rc3 (March 2008). It will still be described below in case that you get an old trace and want to understand it. - -In any case the trace is a simple text file with a single action per line. - .P +In any case the trace is a simple text file with a single action per line.
+.TP .B Trace file format v1 +Each line represents a single I/O action in the following format: .RS -Each line represents a single io action in the following format: - +.RS +.P rw, offset, length - -where rw=0/1 for read/write, and the offset and length entries being in bytes. - -This format is not supported in Fio versions => 1.20-rc3. - .RE .P +where `rw=0/1' for read/write, and the `offset' and `length' entries are in bytes. +.P +This format is not supported in fio versions >= 1.20\-rc3. +.RE +.TP .B Trace file format v2 +The second version of the trace file format was added in fio version 1.17. It +allows accessing more than one file per trace and has a bigger set of possible +file actions. .RS -The second version of the trace file format was added in Fio version 1.17. -It allows one to access more then one file per trace and has a bigger set of -possible file actions. - +.P The first line of the trace file has to be: - -\fBfio version 2 iolog\fR - +.RS +.P +"fio version 2 iolog" +.RE +.P Following this can be lines in two different formats, which are described below. +.P +.B The file management format: - -\fBfilename action\fR - -The filename is given as an absolute path. The action can be one of these: - +.RS +filename action .P -.PD 0 +The `filename' is given as an absolute path. The `action' can be one of these: .RS .TP .B add -Add the given filename to the trace +Add the given `filename' to the trace. .TP .B open -Open the file with the given filename. The filename has to have been previously -added with the \fBadd\fR action. +Open the file with the given `filename'. The `filename' has to have +been added with the \fBadd\fR action before. .TP .B close -Close the file with the given filename. The file must have previously been -opened. +Close the file with the given `filename'. The file has to have been +\fBopen\fRed before.
+.RE .RE -.PD .P - -The file io action format: - -\fBfilename action offset length\fR - -The filename is given as an absolute path, and has to have been added and opened -before it can be used with this format. The offset and length are given in -bytes. The action can be one of these: - +.B +The file I/O action format: +.RS +filename action offset length .P -.PD 0 +The `filename' is given as an absolute path, and has to have been \fBadd\fRed and +\fBopen\fRed before it can be used with this format. The `offset' and `length' are +given in bytes. The `action' can be one of these: .RS .TP .B wait -Wait for 'offset' microseconds. Everything below 100 is discarded. The time is -relative to the previous wait statement. +Wait for `offset' microseconds. Everything below 100 is discarded. +The time is relative to the previous `wait' statement. .TP .B read -Read \fBlength\fR bytes beginning from \fBoffset\fR +Read `length' bytes beginning from `offset'. .TP .B write -Write \fBlength\fR bytes beginning from \fBoffset\fR +Write `length' bytes beginning from `offset'. .TP .B sync -fsync() the file +\fBfsync\fR\|(2) the file. .TP .B datasync -fdatasync() the file +\fBfdatasync\fR\|(2) the file. .TP .B trim -trim the given file from the given \fBoffset\fR for \fBlength\fR bytes +Trim the given file from the given `offset' for `length' bytes. +.RE .RE -.PD -.P - .SH CPU IDLENESS PROFILING -In some cases, we want to understand CPU overhead in a test. For example, -we test patches for the specific goodness of whether they reduce CPU usage. -fio implements a balloon approach to create a thread per CPU that runs at -idle priority, meaning that it only runs when nobody else needs the cpu. -By measuring the amount of work completed by the thread, idleness of each -CPU can be derived accordingly. - -An unit work is defined as touching a full page of unsigned characters. Mean -and standard deviation of time to complete an unit work is reported in "unit -work" section. 
Options can be chosen to report detailed percpu idleness or -overall system idleness by aggregating percpu stats. - +In some cases, we want to understand CPU overhead in a test. For example, we +test patches specifically to see whether they reduce CPU usage. +Fio implements a balloon approach to create a thread per CPU that runs at idle +priority, meaning that it only runs when nobody else needs the CPU. +By measuring the amount of work completed by the thread, idleness of each CPU +can be derived accordingly. +.P +A unit of work is defined as touching a full page of unsigned characters. The mean and +standard deviation of the time to complete a unit of work are reported in the "unit work" +section. Options can be chosen to report detailed percpu idleness or overall +system idleness by aggregating percpu stats. .SH VERIFICATION AND TRIGGERS -Fio is usually run in one of two ways, when data verification is done. The -first is a normal write job of some sort with verify enabled. When the -write phase has completed, fio switches to reads and verifies everything -it wrote. The second model is running just the write phase, and then later -on running the same job (but with reads instead of writes) to repeat the -same IO patterns and verify the contents. Both of these methods depend -on the write phase being completed, as fio otherwise has no idea how much -data was written. - -With verification triggers, fio supports dumping the current write state -to local files. Then a subsequent read verify workload can load this state -and know exactly where to stop. This is useful for testing cases where -power is cut to a server in a managed fashion, for instance. - +Fio is usually run in one of two ways, when data verification is done. The first +is a normal write job of some sort with verify enabled. When the write phase has +completed, fio switches to reads and verifies everything it wrote.
The second +model is running just the write phase, and then later on running the same job +(but with reads instead of writes) to repeat the same I/O patterns and verify +the contents. Both of these methods depend on the write phase being completed, +as fio otherwise has no idea how much data was written. +.P +With verification triggers, fio supports dumping the current write state to +local files. Then a subsequent read verify workload can load this state and know +exactly where to stop. This is useful for testing cases where power is cut to a +server in a managed fashion, for instance. +.P A verification trigger consists of two things: - .RS -Storing the write state of each job -.LP -Executing a trigger command +.P +1) Storing the write state of each job. +.P +2) Executing a trigger command. .RE - -The write state is relatively small, on the order of hundreds of bytes -to single kilobytes. It contains information on the number of completions -done, the last X completions, etc. - -A trigger is invoked either through creation (\fBtouch\fR) of a specified -file in the system, or through a timeout setting. If fio is run with -\fB\-\-trigger\-file=/tmp/trigger-file\fR, then it will continually check for -the existence of /tmp/trigger-file. When it sees this file, it will -fire off the trigger (thus saving state, and executing the trigger +.P +The write state is relatively small, on the order of hundreds of bytes to single +kilobytes. It contains information on the number of completions done, the last X +completions, etc. +.P +A trigger is invoked either through creation ('touch') of a specified file in +the system, or through a timeout setting. If fio is run with +`\-\-trigger\-file=/tmp/trigger\-file', then it will continually +check for the existence of `/tmp/trigger\-file'. When it sees this file, it +will fire off the trigger (thus saving state, and executing the trigger command). - -For client/server runs, there's both a local and remote trigger. 
If -fio is running as a server backend, it will send the job states back -to the client for safe storage, then execute the remote trigger, if -specified. If a local trigger is specified, the server will still send -back the write state, but the client will then execute the trigger. - +.P +For client/server runs, there's both a local and remote trigger. If fio is +running as a server backend, it will send the job states back to the client for +safe storage, then execute the remote trigger, if specified. If a local trigger +is specified, the server will still send back the write state, but the client +will then execute the trigger. .RE .P .B Verification trigger example .RS - -Lets say we want to run a powercut test on the remote machine 'server'. -Our write workload is in write-test.fio. We want to cut power to 'server' -at some point during the run, and we'll run this test from the safety -or our local machine, 'localbox'. On the server, we'll start the fio -backend normally: - -server# \fBfio \-\-server\fR - +Let's say we want to run a powercut test on the remote Linux machine 'server'. +Our write workload is in `write\-test.fio'. We want to cut power to 'server' at +some point during the run, and we'll run this test from the safety of our local +machine, 'localbox'. On the server, we'll start the fio backend normally: +.RS +.P +server# fio \-\-server +.RE +.P and on the client, we'll fire off the workload: - -localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger-remote="bash \-c "echo b > /proc/sysrq-triger""\fR - -We set \fB/tmp/my-trigger\fR as the trigger file, and we tell fio to execute - -\fBecho b > /proc/sysrq-trigger\fR - -on the server once it has received the trigger and sent us the write -state. This will work, but it's not \fIreally\fR cutting power to the server, -it's merely abruptly rebooting it. 
+.RS +.P +localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger\-remote="bash \-c 'echo b > /proc/sysrq\-trigger'" +.RE +.P +We set `/tmp/my\-trigger' as the trigger file, and we tell fio to execute: +.RS +.P +echo b > /proc/sysrq\-trigger +.RE +.P +on the server once it has received the trigger and sent us the write state. This +will work, but it's not really cutting power to the server, it's merely +abruptly rebooting it.
If we have a remote way of cutting -power to the server through IPMI or similar, we could do that through -a local trigger command instead. Lets assume we have a script that does -IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could -then have run fio with a local trigger instead: - -localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi-reboot server"\fR - -For this case, fio would wait for the server to send us the write state, -then execute 'ipmi-reboot server' when that happened. - +.RS +.P +localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger\-remote="bash \-c "echo b > /proc/sysrq\-triger"" +.RE +.P +We set `/tmp/my\-trigger' as the trigger file, and we tell fio to execute: +.RS +.P +echo b > /proc/sysrq\-trigger +.RE +.P +on the server once it has received the trigger and sent us the write state. This +will work, but it's not really cutting power to the server, it's merely +abruptly rebooting it. If we have a remote way of cutting power to the server +through IPMI or similar, we could do that through a local trigger command +instead. Let's assume we have a script that does IPMI reboot of a given hostname, +ipmi\-reboot. On localbox, we could then have run fio with a local trigger +instead: +.RS +.P +localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi\-reboot server" +.RE +.P +For this case, fio would wait for the server to send us the write state, then +execute `ipmi\-reboot server' when that happened. .RE .P .B Loading verify state .RS -To load store write state, read verification job file must contain -the verify_state_load option. If that is set, fio will load the previously +To load stored write state, a read verification job file must contain the +\fBverify_state_load\fR option. If that is set, fio will load the previously stored state. 
For a local fio run this is done by loading the files directly, -and on a client/server run, the server backend will ask the client to send -the files over and load them from there. - +and on a client/server run, the server backend will ask the client to send the +files over and load them from there. .RE - .SH LOG FILE FORMATS - Fio supports a variety of log file formats, for logging latencies, bandwidth, and IOPS. The logs share a common format, which looks like this: - -.B time (msec), value, data direction, offset - -Time for the log entry is always in milliseconds. The value logged depends -on the type of log, it will be one of the following: - +.RS .P -.PD 0 +time (msec), value, data direction, block size (bytes), offset (bytes) +.RE +.P +`Time' for the log entry is always in milliseconds. The `value' logged depends +on the type of log, it will be one of the following: +.RS .TP .B Latency log -Value is in latency in usecs +Value is latency in nsecs .TP .B Bandwidth log -Value is in KB/sec +Value is in KiB/sec .TP .B IOPS log -Value is in IOPS -.PD -.P - -Data direction is one of the following: - +Value is IOPS +.RE .P -.PD 0 +`Data direction' is one of the following: +.RS .TP .B 0 -IO is a READ +I/O is a READ .TP .B 1 -IO is a WRITE +I/O is a WRITE .TP .B 2 -IO is a TRIM -.PD -.P - -The \fIoffset\fR is the offset, in bytes, from the start of the file, for that -particular IO. The logging of the offset can be toggled with \fBlog_offset\fR. - -If windowed logging is enabled through \fBlog_avg_msec\fR, then fio doesn't log -individual IOs. Instead of logs the average values over the specified -period of time. Since \fIdata direction\fR and \fIoffset\fR are per-IO values, -they aren't applicable if windowed logging is enabled. If windowed logging -is enabled and \fBlog_max_value\fR is set, then fio logs maximum values in -that window instead of averages. 
- -For histogram logging the logs look like this: - -.B time (msec), data direction, block-size, bin 0, bin 1, ..., bin 1215 - -Where 'bin i' gives the frequency of IO requests with a latency falling in -the i-th bin. See \fBlog_hist_coarseness\fR for logging fewer bins. - +I/O is a TRIM .RE - +.P +The entry's `block size' is always in bytes. The `offset' is the offset, in bytes, +from the start of the file, for that particular I/O. The logging of the offset can be +toggled with \fBlog_offset\fR. +.P +Fio defaults to logging every individual I/O. When IOPS are logged for individual +I/Os the `value' entry will always be 1. If windowed logging is enabled through +\fBlog_avg_msec\fR, fio logs the average values over the specified period of time. +If windowed logging is enabled and \fBlog_max_value\fR is set, then fio logs +maximum values in that window instead of averages. Since `data direction', `block size' +and `offset' are per\-I/O values, if windowed logging is enabled they +aren't applicable and will be 0. .SH CLIENT / SERVER -Normally you would run fio as a stand-alone application on the machine -where the IO workload should be generated. However, it is also possible to -run the frontend and backend of fio separately. This makes it possible to -have a fio server running on the machine(s) where the IO workload should -be running, while controlling it from another machine. - -To start the server, you would do: - -\fBfio \-\-server=args\fR - -on that machine, where args defines what fio listens to. The arguments -are of the form 'type:hostname or IP:port'. 'type' is either 'ip' (or ip4) -for TCP/IP v4, 'ip6' for TCP/IP v6, or 'sock' for a local unix domain -socket. 'hostname' is either a hostname or IP address, and 'port' is the port to -listen to (only valid for TCP/IP, not a local socket). Some examples: - +Normally fio is invoked as a stand\-alone application on the machine where the +I/O workload should be generated. 
However, the backend and frontend of fio can +be run separately i.e., the fio server can generate an I/O workload on the "Device +Under Test" while being controlled by a client on another machine. +.P +Start the server on the machine which has access to the storage DUT: +.RS +.P +$ fio \-\-server=args +.RE +.P +where `args' defines what fio listens to. The arguments are of the form +`type:hostname or IP,port'. `type' is either `ip' (or ip4) for TCP/IP +v4, `ip6' for TCP/IP v6, or `sock' for a local unix domain socket. +`hostname' is either a hostname or IP address, and `port' is the port to listen +to (only valid for TCP/IP, not a local socket). Some examples: +.RS +.TP 1) \fBfio \-\-server\fR - - Start a fio server, listening on all interfaces on the default port (8765). - +Start a fio server, listening on all interfaces on the default port (8765). +.TP 2) \fBfio \-\-server=ip:hostname,4444\fR - - Start a fio server, listening on IP belonging to hostname and on port 4444. - +Start a fio server, listening on IP belonging to hostname and on port 4444. +.TP 3) \fBfio \-\-server=ip6:::1,4444\fR - - Start a fio server, listening on IPv6 localhost ::1 and on port 4444. - +Start a fio server, listening on IPv6 localhost ::1 and on port 4444. +.TP 4) \fBfio \-\-server=,4444\fR - - Start a fio server, listening on all interfaces on port 4444. - +Start a fio server, listening on all interfaces on port 4444. +.TP 5) \fBfio \-\-server=1.2.3.4\fR - - Start a fio server, listening on IP 1.2.3.4 on the default port. - +Start a fio server, listening on IP 1.2.3.4 on the default port. +.TP 6) \fBfio \-\-server=sock:/tmp/fio.sock\fR - - Start a fio server, listening on the local socket /tmp/fio.sock. - -When a server is running, you can connect to it from a client.
The client -is run with: - -\fBfio \-\-local-args \-\-client=server \-\-remote-args \fR - -where \-\-local-args are arguments that are local to the client where it is -running, 'server' is the connect string, and \-\-remote-args and -are sent to the server. The 'server' string follows the same format as it -does on the server side, to allow IP/hostname/socket and port strings. -You can connect to multiple clients as well, to do that you could run: - -\fBfio \-\-client=server2 \-\-client=server2 \fR - -If the job file is located on the fio server, then you can tell the server -to load a local file as well. This is done by using \-\-remote-config: - -\fBfio \-\-client=server \-\-remote-config /path/to/file.fio\fR - -Then fio will open this local (to the server) job file instead -of being passed one from the client. - +Start a fio server, listening on the local socket `/tmp/fio.sock'. +.RE +.P +Once a server is running, a "client" can connect to the fio server with: +.RS +.P +$ fio <local\-args> \-\-client=<server> <remote\-args> <job file(s)> +.RE +.P +where `local\-args' are arguments for the client where it is running, `server' +is the connect string, and `remote\-args' and `job file(s)' are sent to the +server. The `server' string follows the same format as it does on the server +side, to allow IP/hostname/socket and port strings. +.P +Fio can connect to multiple servers this way: +.RS +.P +$ fio \-\-client=<server1> <job file(s)> \-\-client=<server2> <job file(s)> +.RE +.P +If the job file is located on the fio server, then you can tell the server to +load a local file as well. This is done by using \fB\-\-remote\-config\fR: +.RS +.P +$ fio \-\-client=server \-\-remote\-config /path/to/file.fio +.RE +.P +Then fio will open this local (to the server) job file instead of being passed +one from the client. +.P If you have many servers (example: 100 VMs/containers), you can input a pathname -of a file containing host IPs/names as the parameter value for the \-\-client option.
-For example, here is an example "host.list" file containing 2 hostnames: - +of a file containing host IPs/names as the parameter value for the +\fB\-\-client\fR option. For example, here is an example `host.list' +file containing 2 hostnames: +.RS +.P +.PD 0 host1.your.dns.domain -.br +.P host2.your.dns.domain - +.PD +.RE +.P The fio command would then be: - -\fBfio \-\-client=host.list \fR - -In this mode, you cannot input server-specific parameters or job files, and all +.RS +.P +$ fio \-\-client=host.list +.RE +.P +In this mode, you cannot input server\-specific parameters or job files \-\- all servers receive the same job file. - -In order to enable fio \-\-client runs utilizing a shared filesystem from multiple hosts, -fio \-\-client now prepends the IP address of the server to the filename. For example, -if fio is using directory /mnt/nfs/fio and is writing filename fileio.tmp, -with a \-\-client hostfile -containing two hostnames h1 and h2 with IP addresses 192.168.10.120 and 192.168.10.121, then -fio will create two files: - +.P +In order to let `fio \-\-client' runs use a shared filesystem from multiple +hosts, `fio \-\-client' now prepends the IP address of the server to the +filename. For example, if fio is using the directory `/mnt/nfs/fio' and is +writing filename `fileio.tmp', with a \fB\-\-client\fR `hostfile' +containing two hostnames `h1' and `h2' with IP addresses 192.168.10.120 and +192.168.10.121, then fio will create two files: +.RS +.P +.PD 0 /mnt/nfs/fio/192.168.10.120.fileio.tmp -.br +.P /mnt/nfs/fio/192.168.10.121.fileio.tmp - +.PD +.RE .SH AUTHORS - .B fio was written by Jens Axboe , now Jens Axboe . .br This man page was written by Aaron Carroll based on documentation by Jens Axboe. +.br +This man page was rewritten by Tomohiro Kusumi based +on documentation by Jens Axboe. .SH "REPORTING BUGS" Report bugs to the \fBfio\fR mailing list . -See \fBREADME\fR. +.br +See \fBREPORTING\-BUGS\fR. 
+.P +\fBREPORTING\-BUGS\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/REPORTING\-BUGS\fR .SH "SEE ALSO" For further documentation see \fBHOWTO\fR and \fBREADME\fR. .br -Sample jobfiles are available in the \fBexamples\fR directory. +Sample jobfiles are available in the `examples/' directory. +.br +These are typically located under `/usr/share/doc/fio'. +.P +\fBHOWTO\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/HOWTO\fR +.br +\fBREADME\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/README\fR diff -Nru fio-2.16/fio.h fio-3.1/fio.h --- fio-2.16/fio.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/fio.h 2017-09-28 10:23:20.000000000 +0000 @@ -25,7 +25,7 @@ #include "debug.h" #include "file.h" #include "io_ddir.h" -#include "ioengine.h" +#include "ioengines.h" #include "iolog.h" #include "helpers.h" #include "options.h" @@ -35,10 +35,12 @@ #include "oslib/getopt.h" #include "lib/rand.h" #include "lib/rbtree.h" +#include "lib/num2str.h" #include "client.h" #include "server.h" #include "stat.h" #include "flow.h" +#include "io_u.h" #include "io_u_queue.h" #include "workqueue.h" #include "steadystate.h" @@ -57,6 +59,10 @@ #define MPOL_LOCAL MPOL_MAX #endif +#ifdef CONFIG_CUDA +#include +#endif + /* * offset generator types */ @@ -74,17 +80,20 @@ TD_F_VER_NONE = 1U << 5, TD_F_PROFILE_OPS = 1U << 6, TD_F_COMPRESS = 1U << 7, - TD_F_NOIO = 1U << 8, + TD_F_RESERVED = 1U << 8, /* not used */ TD_F_COMPRESS_LOG = 1U << 9, TD_F_VSTATE_SAVED = 1U << 10, TD_F_NEED_LOCK = 1U << 11, TD_F_CHILD = 1U << 12, TD_F_NO_PROGRESS = 1U << 13, TD_F_REGROW_LOGS = 1U << 14, + TD_F_MMAP_KEEP = 1U << 15, }; enum { FIO_RAND_BS_OFF = 0, + FIO_RAND_BS1_OFF, + FIO_RAND_BS2_OFF, FIO_RAND_VER_OFF, FIO_RAND_MIX_OFF, FIO_RAND_FILE_OFF, @@ -99,6 +108,8 @@ FIO_DEDUPE_OFF, FIO_RAND_POISSON_OFF, FIO_RAND_ZONE_OFF, + FIO_RAND_POISSON2_OFF, + FIO_RAND_POISSON3_OFF, FIO_RAND_NR_OFFS, }; @@ -121,7 +132,6 @@ * Per-thread/process specific data. Only used for the network client * for now. 
*/ -struct sk_out; void sk_out_assign(struct sk_out *); void sk_out_drop(void); @@ -142,7 +152,7 @@ unsigned int thread_number; unsigned int subjob_number; unsigned int groupid; - struct thread_stat ts; + struct thread_stat ts __attribute__ ((aligned(8))); int client_type; @@ -158,10 +168,10 @@ struct thread_data *parent; uint64_t stat_io_bytes[DDIR_RWDIR_CNT]; - struct timeval bw_sample_time; + struct timespec bw_sample_time; uint64_t stat_io_blocks[DDIR_RWDIR_CNT]; - struct timeval iops_sample_time; + struct timespec iops_sample_time; volatile int update_rusage; struct fio_mutex *rusage_sem; @@ -205,11 +215,9 @@ void *iolog_buf; FILE *iolog_f; - char *sysfs_root; - unsigned long rand_seeds[FIO_RAND_NR_OFFS]; - struct frand_state bsrange_state; + struct frand_state bsrange_state[DDIR_RWDIR_CNT]; struct frand_state verify_state; struct frand_state trim_state; struct frand_state delay_state; @@ -233,6 +241,7 @@ * to any of the available IO engines. */ struct ioengine_ops *io_ops; + int io_ops_init; /* * IO engine private data and dlhandle. @@ -281,9 +290,9 @@ unsigned long rate_bytes[DDIR_RWDIR_CNT]; unsigned long rate_blocks[DDIR_RWDIR_CNT]; unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT]; - struct timeval lastrate[DDIR_RWDIR_CNT]; - int64_t last_usec; - struct frand_state poisson_state; + struct timespec lastrate[DDIR_RWDIR_CNT]; + int64_t last_usec[DDIR_RWDIR_CNT]; + struct frand_state poisson_state[DDIR_RWDIR_CNT]; /* * Enforced rate submission/completion workqueue @@ -317,21 +326,21 @@ */ struct frand_state random_state; - struct timeval start; /* start of this loop */ - struct timeval epoch; /* time job was started */ + struct timespec start; /* start of this loop */ + struct timespec epoch; /* time job was started */ unsigned long long unix_epoch; /* Time job was started, unix epoch based. 
*/ - struct timeval last_issue; + struct timespec last_issue; long time_offset; - struct timeval tv_cache; - struct timeval terminate_time; - unsigned int tv_cache_nr; - unsigned int tv_cache_mask; - unsigned int ramp_time_over; + struct timespec ts_cache; + struct timespec terminate_time; + unsigned int ts_cache_nr; + unsigned int ts_cache_mask; + bool ramp_time_over; /* * Time since last latency_window was started */ - struct timeval latency_ts; + struct timespec latency_ts; unsigned int latency_qd; unsigned int latency_qd_high; unsigned int latency_qd_low; @@ -406,6 +415,18 @@ struct steadystate_data ss; char verror[FIO_VERROR_SIZE]; + +#ifdef CONFIG_CUDA + /* + * for GPU memory management + */ + int gpu_dev_cnt; + int gpu_dev_id; + CUdevice cu_dev; + CUcontext cu_ctx; + CUdeviceptr dev_mem_ptr; +#endif + }; /* @@ -492,7 +513,7 @@ { if (td->last_was_sync) return 0; - if (td_write(td) || td_rw(td) || td->o.override_sync) + if (td_write(td) || td->o.override_sync) return 1; return 0; @@ -518,11 +539,9 @@ extern void fio_options_set_ioengine_opts(struct option *long_options, struct thread_data *td); extern void fio_options_dup_and_init(struct option *); extern void fio_options_mem_dupe(struct thread_data *); -extern void options_mem_dupe(void *data, struct fio_option *options); extern void td_fill_rand_seeds(struct thread_data *); extern void td_fill_verify_state_seed(struct thread_data *); extern void add_job_opts(const char **, int); -extern char *num2str(uint64_t, int, int, int, int); extern int ioengine_load(struct thread_data *); extern bool parse_dryrun(void); extern int fio_running_or_pending_io_threads(void); @@ -580,7 +599,8 @@ static inline void td_set_ioengine_flags(struct thread_data *td) { - td->flags |= (td->io_ops->flags << TD_ENG_FLAG_SHIFT); + td->flags = (~(TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT) & td->flags) | + (td->io_ops->flags << TD_ENG_FLAG_SHIFT); } static inline bool td_ioengine_flagged(struct thread_data *td, @@ -613,22 +633,19 @@ extern 
void free_io_mem(struct thread_data *); extern void free_threads_shm(void); +#ifdef FIO_INTERNAL +#define PTR_ALIGN(ptr, mask) \ + (char *) (((uintptr_t) (ptr) + (mask)) & ~(mask)) +#endif + /* * Reset stats after ramp time completes */ extern void reset_all_stats(struct thread_data *); -/* - * blktrace support - */ -#ifdef FIO_HAVE_BLKTRACE -extern int is_blktrace(const char *, int *); -extern int load_blktrace(struct thread_data *, const char *, int); -#endif - extern int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify, - struct timeval *comp_time); + struct timespec *comp_time); /* * Latency target helpers @@ -637,6 +654,9 @@ extern void lat_target_init(struct thread_data *); extern void lat_target_reset(struct thread_data *); +/* + * Iterates all threads/processes within all the defined jobs + */ #define for_each_td(td, i) \ for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++) #define for_each_file(td, f, i) \ diff -Nru fio-2.16/fio_time.h fio-3.1/fio_time.h --- fio-2.16/fio_time.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/fio_time.h 2017-09-28 10:23:20.000000000 +0000 @@ -4,22 +4,24 @@ #include "lib/types.h" struct thread_data; -extern uint64_t utime_since(const struct timeval *,const struct timeval *); -extern uint64_t utime_since_now(const struct timeval *); -extern uint64_t mtime_since(const struct timeval *, const struct timeval *); -extern uint64_t mtime_since_now(const struct timeval *); -extern uint64_t time_since_now(const struct timeval *); +extern uint64_t ntime_since(const struct timespec *, const struct timespec *); +extern uint64_t utime_since(const struct timespec *, const struct timespec *); +extern uint64_t utime_since_now(const struct timespec *); +extern uint64_t mtime_since(const struct timespec *, const struct timespec *); +extern uint64_t mtime_since_now(const struct timespec *); +extern uint64_t mtime_since_tv(const struct timeval 
*, const struct timeval *); +extern uint64_t time_since_now(const struct timespec *); extern uint64_t time_since_genesis(void); extern uint64_t mtime_since_genesis(void); extern uint64_t utime_since_genesis(void); extern uint64_t usec_spin(unsigned int); extern uint64_t usec_sleep(struct thread_data *, unsigned long); -extern void fill_start_time(struct timeval *); +extern void fill_start_time(struct timespec *); extern void set_genesis_time(void); extern bool ramp_time_over(struct thread_data *); extern bool in_ramp_time(struct thread_data *); extern void fio_time_init(void); -extern void timeval_add_msec(struct timeval *, unsigned int); +extern void timespec_add_msec(struct timespec *, unsigned int); extern void set_epoch_time(struct thread_data *, int); #endif diff -Nru fio-2.16/FIO-VERSION-GEN fio-3.1/FIO-VERSION-GEN --- fio-2.16/FIO-VERSION-GEN 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/FIO-VERSION-GEN 2017-09-28 10:23:20.000000000 +0000 @@ -1,7 +1,7 @@ #!/bin/sh GVF=FIO-VERSION-FILE -DEF_VER=fio-2.16 +DEF_VER=fio-3.1 LF=' ' diff -Nru fio-2.16/flist.h fio-3.1/flist.h --- fio-2.16/flist.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/flist.h 2017-09-28 10:23:20.000000000 +0000 @@ -2,13 +2,7 @@ #define _LINUX_FLIST_H #include - -#undef offsetof -#ifdef __compiler_offsetof -#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) -#else -#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) -#endif +#include #define container_of(ptr, type, member) ({ \ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ diff -Nru fio-2.16/gclient.c fio-3.1/gclient.c --- fio-2.16/gclient.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/gclient.c 2017-09-28 10:23:20.000000000 +0000 @@ -48,7 +48,7 @@ { "PrintFile", GTK_STOCK_PRINT, "Print", "P", NULL, G_CALLBACK(results_print) }, { "CloseFile", GTK_STOCK_CLOSE, "Close", "W", NULL, G_CALLBACK(results_close) }, }; -static gint results_nmenu_items = sizeof(results_menu_items) / sizeof(results_menu_items[0]); 
+static gint results_nmenu_items = ARRAY_SIZE(results_menu_items); static const gchar *results_ui_string = " \ \ @@ -364,29 +364,11 @@ sprintf(tmp, "%u", je->files_open); gtk_entry_set_text(GTK_ENTRY(ge->eta.files), tmp); -#if 0 - if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) { - if (je->m_rate || je->t_rate) { - char *tr, *mr; - - mr = num2str(je->m_rate, 4, 0, i2p); - tr = num2str(je->t_rate, 4, 0, i2p); - gtk_entry_set_text(GTK_ENTRY(ge->eta); - p += sprintf(p, ", CR=%s/%s KB/s", tr, mr); - free(tr); - free(mr); - } else if (je->m_iops || je->t_iops) - p += sprintf(p, ", CR=%d/%d IOPS", je->t_iops, je->m_iops); - - gtk_entry_set_text(GTK_ENTRY(ge->eta.cr_bw), "---"); - gtk_entry_set_text(GTK_ENTRY(ge->eta.cr_iops), "---"); - gtk_entry_set_text(GTK_ENTRY(ge->eta.cw_bw), "---"); - gtk_entry_set_text(GTK_ENTRY(ge->eta.cw_iops), "---"); -#endif - if (je->eta_sec != INT_MAX && je->nr_running) { char *iops_str[DDIR_RWDIR_CNT]; char *rate_str[DDIR_RWDIR_CNT]; + char *rate_alt[DDIR_RWDIR_CNT]; + char tmp[128]; int i; if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running) @@ -397,19 +379,26 @@ sprintf(output, "%3.1f%% done", perc); } - rate_str[0] = num2str(je->rate[0], 5, 10, i2p, 0); - rate_str[1] = num2str(je->rate[1], 5, 10, i2p, 0); - rate_str[2] = num2str(je->rate[2], 5, 10, i2p, 0); - - iops_str[0] = num2str(je->iops[0], 4, 1, 0, 0); - iops_str[1] = num2str(je->iops[1], 4, 1, 0, 0); - iops_str[2] = num2str(je->iops[2], 4, 1, 0, 0); - - gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), rate_str[0]); + iops_str[0] = num2str(je->iops[0], 4, 1, 0, N2S_PERSEC); + iops_str[1] = num2str(je->iops[1], 4, 1, 0, N2S_PERSEC); + iops_str[2] = num2str(je->iops[2], 4, 1, 0, N2S_PERSEC); + + rate_str[0] = num2str(je->rate[0], 4, 10, i2p, N2S_BYTEPERSEC); + rate_alt[0] = num2str(je->rate[0], 4, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), tmp); 
gtk_entry_set_text(GTK_ENTRY(ge->eta.read_iops), iops_str[0]); - gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), rate_str[1]); + + rate_str[1] = num2str(je->rate[1], 4, 10, i2p, N2S_BYTEPERSEC); + rate_alt[1] = num2str(je->rate[1], 4, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ge->eta.write_iops), iops_str[1]); - gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), rate_str[2]); + + rate_str[2] = num2str(je->rate[2], 4, 10, i2p, N2S_BYTEPERSEC); + rate_alt[2] = num2str(je->rate[2], 4, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_iops), iops_str[2]); graph_add_xy_data(ge->graphs.iops_graph, ge->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]); @@ -421,6 +410,7 @@ for (i = 0; i < DDIR_RWDIR_CNT; i++) { free(rate_str[i]); + free(rate_alt[i]); free(iops_str[i]); } } @@ -457,31 +447,13 @@ eta_to_str(eta_str, je->eta_sec); } -#if 0 - if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) { - if (je->m_rate || je->t_rate) { - char *tr, *mr; - - mr = num2str(je->m_rate, 4, 0, i2p); - tr = num2str(je->t_rate, 4, 0, i2p); - gtk_entry_set_text(GTK_ENTRY(ui->eta); - p += sprintf(p, ", CR=%s/%s KB/s", tr, mr); - free(tr); - free(mr); - } else if (je->m_iops || je->t_iops) - p += sprintf(p, ", CR=%d/%d IOPS", je->t_iops, je->m_iops); - - gtk_entry_set_text(GTK_ENTRY(ui->eta.cr_bw), "---"); - gtk_entry_set_text(GTK_ENTRY(ui->eta.cr_iops), "---"); - gtk_entry_set_text(GTK_ENTRY(ui->eta.cw_bw), "---"); - gtk_entry_set_text(GTK_ENTRY(ui->eta.cw_iops), "---"); -#endif - entry_set_int_value(ui->eta.jobs, je->nr_running); if (je->eta_sec != INT_MAX && je->nr_running) { - char *iops_str[3]; - char *rate_str[3]; + char *iops_str[DDIR_RWDIR_CNT]; + char *rate_str[DDIR_RWDIR_CNT]; + char 
*rate_alt[DDIR_RWDIR_CNT]; + char tmp[128]; if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running) strcpy(output, "-.-% done"); @@ -491,19 +463,26 @@ sprintf(output, "%3.1f%% done", perc); } - rate_str[0] = num2str(je->rate[0], 5, 10, i2p, 0); - rate_str[1] = num2str(je->rate[1], 5, 10, i2p, 0); - rate_str[2] = num2str(je->rate[2], 5, 10, i2p, 0); - - iops_str[0] = num2str(je->iops[0], 4, 1, 0, 0); - iops_str[1] = num2str(je->iops[1], 4, 1, 0, 0); - iops_str[2] = num2str(je->iops[2], 4, 1, 0, 0); - - gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), rate_str[0]); + iops_str[0] = num2str(je->iops[0], 4, 1, 0, N2S_PERSEC); + iops_str[1] = num2str(je->iops[1], 4, 1, 0, N2S_PERSEC); + iops_str[2] = num2str(je->iops[2], 4, 1, 0, N2S_PERSEC); + + rate_str[0] = num2str(je->rate[0], 4, 10, i2p, N2S_BYTEPERSEC); + rate_alt[0] = num2str(je->rate[0], 4, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ui->eta.read_iops), iops_str[0]); - gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), rate_str[1]); + + rate_str[1] = num2str(je->rate[1], 4, 10, i2p, N2S_BYTEPERSEC); + rate_alt[1] = num2str(je->rate[1], 4, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ui->eta.write_iops), iops_str[1]); - gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), rate_str[2]); + + rate_str[2] = num2str(je->rate[2], 4, 10, i2p, N2S_BYTEPERSEC); + rate_alt[2] = num2str(je->rate[2], 4, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_iops), iops_str[2]); graph_add_xy_data(ui->graphs.iops_graph, ui->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]); @@ -515,6 +494,7 @@ for (i = 0; i < 
DDIR_RWDIR_CNT; i++) { free(rate_str[i]); + free(rate_alt[i]); free(iops_str[i]); } } @@ -592,6 +572,7 @@ struct thread_options *o; char *c1, *c2, *c3, *c4; char tmp[80]; + int i2p; p->thread_number = le32_to_cpu(p->thread_number); p->groupid = le32_to_cpu(p->groupid); @@ -605,11 +586,13 @@ sprintf(tmp, "%s %s", o->odirect ? "direct" : "buffered", ddir_str(o->td_ddir)); multitext_add_entry(&ge->eta.iotype, tmp); - c1 = fio_uint_to_kmg(o->min_bs[DDIR_READ]); - c2 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]); - c3 = fio_uint_to_kmg(o->min_bs[DDIR_READ]); - c4 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]); - sprintf(tmp, "%s-%s/%s-%s", c1, c2, c3, c4); + i2p = is_power_of_2(o->kb_base); + c1 = num2str(o->min_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE); + c2 = num2str(o->max_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE); + c3 = num2str(o->min_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE); + c4 = num2str(o->max_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE); + + sprintf(tmp, "%s-%s,%s-%s", c1, c2, c3, c4); free(c1); free(c2); free(c3); @@ -947,18 +930,21 @@ static void gfio_show_latency_buckets(struct gfio_client *gc, GtkWidget *vbox, struct thread_stat *ts) { - double io_u_lat[FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR]; - const char *ranges[] = { "2u", "4u", "10u", "20u", "50u", "100u", - "250u", "500u", "750u", "1m", "2m", - "4m", "10m", "20m", "50m", "100m", - "250m", "500m", "750m", "1s", "2s", ">= 2s" }; + double io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR]; + const char *ranges[] = { "2ns", "4ns", "10ns", "20ns", "50ns", "100ns", + "250ns", "500ns", "750ns", "1000ns", "2us", + "4us", "10us", "20us", "50us", "100us", + "250us", "500us", "750us", "1ms", "2ms", + "4ms", "10ms", "20ms", "50ms", "100ms", + "250ms", "500ms", "750ms", "1s", "2s", ">= 2s" }; int start, end, i; const int total = FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR; GtkWidget *frame, *tree_view, *hbox, *completion_vbox, *drawing_area; struct gui_entry *ge = gc->ge; - stat_calc_lat_u(ts, io_u_lat); - stat_calc_lat_m(ts, 
&io_u_lat[FIO_IO_U_LAT_U_NR]); + stat_calc_lat_n(ts, io_u_lat); + stat_calc_lat_u(ts, &io_u_lat[FIO_IO_U_LAT_N_NR]); + stat_calc_lat_m(ts, &io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR]); /* * Found out which first bucket has entries, and which last bucket @@ -980,7 +966,7 @@ return; tree_view = gfio_output_lat_buckets(&io_u_lat[start], &ranges[start], end - start + 1); - ge->lat_bucket_graph = setup_lat_bucket_graph("Latency Buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0); + ge->lat_bucket_graph = setup_lat_bucket_graph("Latency buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0); frame = gtk_frame_new("Latency buckets"); gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); @@ -1000,19 +986,21 @@ gtk_box_pack_start(GTK_BOX(hbox), tree_view, TRUE, TRUE, 3); } -static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long min, - unsigned long max, double mean, double dev) +static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long long min, + unsigned long long max, double mean, double dev) { - const char *base = "(usec)"; + const char *base = "(nsec)"; GtkWidget *hbox, *label, *frame; char *minp, *maxp; char tmp[64]; - if (usec_to_msec(&min, &max, &mean, &dev)) + if (nsec_to_msec(&min, &max, &mean, &dev)) base = "(msec)"; + else if (nsec_to_usec(&min, &max, &mean, &dev)) + base = "(usec)"; - minp = num2str(min, 6, 1, 0, 0); - maxp = num2str(max, 6, 1, 0, 0); + minp = num2str(min, 6, 1, 0, N2S_NONE); + maxp = num2str(max, 6, 1, 0, N2S_NONE); sprintf(tmp, "%s %s", name, base); frame = gtk_frame_new(tmp); @@ -1036,7 +1024,7 @@ free(maxp); } -static GtkWidget *gfio_output_clat_percentiles(unsigned int *ovals, +static GtkWidget *gfio_output_clat_percentiles(unsigned long long *ovals, fio_fp64_t *plist, unsigned int len, const char *base, @@ -1047,10 +1035,10 @@ GtkTreeSelection *selection; GtkListStore *model; GtkTreeIter iter; - int i; + int i, j; for (i = 0; i < len; i++) - types[i] 
= G_TYPE_INT; + types[i] = G_TYPE_ULONG; model = gtk_list_store_newv(len, types); @@ -1073,15 +1061,15 @@ gtk_list_store_append(model, &iter); for (i = 0; i < len; i++) { - if (scale) + for (j = 0; j < scale; j++) ovals[i] = (ovals[i] + 999) / 1000; - gtk_list_store_set(model, &iter, i, ovals[i], -1); + gtk_list_store_set(model, &iter, i, (unsigned long) ovals[i], -1); } return tree_view; } -static struct graph *setup_clat_graph(char *title, unsigned int *ovals, +static struct graph *setup_clat_graph(char *title, unsigned long long *ovals, fio_fp64_t *plist, unsigned int len, double xdim, double ydim) @@ -1113,7 +1101,8 @@ unsigned int *io_u_plat = ts->io_u_plat[ddir]; unsigned long nr = ts->clat_stat[ddir].samples; fio_fp64_t *plist = ts->percentile_list; - unsigned int *ovals, len, minv, maxv, scale_down; + unsigned int len, scale_down; + unsigned long long *ovals, minv, maxv; const char *base; GtkWidget *tree_view, *frame, *hbox, *drawing_area, *completion_vbox; struct gui_entry *ge = gc->ge; @@ -1124,18 +1113,25 @@ goto out; /* - * We default to usecs, but if the value range is such that we - * should scale down to msecs, do that. + * We default to nsecs, but if the value range is such that we + * should scale down to usecs or msecs, do that. 
*/ - if (minv > 2000 && maxv > 99999) { - scale_down = 1; + if (minv > 2000000 && maxv > 99999999ULL) { + scale_down = 2; base = "msec"; - } else { - scale_down = 0; + } else if (minv > 2000 && maxv > 99999) { + scale_down = 1; base = "usec"; - } + } else { + scale_down = 0; + base = "nsec"; + } + + if (ts->clat_percentiles) + sprintf(tmp, "Completion percentiles (%s)", base); + else + sprintf(tmp, "Latency percentiles (%s)", base); - sprintf(tmp, "Completion percentiles (%s)", base); tree_view = gfio_output_clat_percentiles(ovals, plist, len, base, scale_down); ge->clat_graph = setup_clat_graph(tmp, ovals, plist, len, 700.0, 300.0); @@ -1169,11 +1165,13 @@ { const char *ddir_label[3] = { "Read", "Write", "Trim" }; GtkWidget *frame, *label, *box, *vbox, *main_vbox; - unsigned long min[3], max[3], runt; + unsigned long long min[3], max[3]; + unsigned long runt; unsigned long long bw, iops; unsigned int flags = 0; double mean[3], dev[3]; - char *io_p, *bw_p, *iops_p; + char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p; + char tmp[128]; int i2p; if (!ts->runtime[ddir]) @@ -1183,11 +1181,9 @@ runt = ts->runtime[ddir]; bw = (1000 * ts->io_bytes[ddir]) / runt; - io_p = num2str(ts->io_bytes[ddir], 6, 1, i2p, 8); - bw_p = num2str(bw, 6, 1, i2p, ts->unit_base); iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt; - iops_p = num2str(iops, 6, 1, 0, 0); + iops_p = num2str(iops, 4, 1, 0, N2S_PERSEC); box = gtk_hbox_new(FALSE, 3); gtk_box_pack_start(GTK_BOX(mbox), box, TRUE, FALSE, 3); @@ -1202,9 +1198,17 @@ gtk_box_pack_start(GTK_BOX(main_vbox), box, TRUE, FALSE, 3); label = new_info_label_in_frame(box, "IO"); - gtk_label_set_text(GTK_LABEL(label), io_p); + io_p = num2str(ts->io_bytes[ddir], 4, 1, i2p, N2S_BYTE); + io_palt = num2str(ts->io_bytes[ddir], 4, 1, !i2p, N2S_BYTE); + snprintf(tmp, sizeof(tmp), "%s (%s)", io_p, io_palt); + gtk_label_set_text(GTK_LABEL(label), tmp); + label = new_info_label_in_frame(box, "Bandwidth"); - gtk_label_set_text(GTK_LABEL(label), bw_p); + bw_p 
= num2str(bw, 4, 1, i2p, ts->unit_base); + bw_palt = num2str(bw, 4, 1, !i2p, ts->unit_base); + snprintf(tmp, sizeof(tmp), "%s (%s)", bw_p, bw_palt); + gtk_label_set_text(GTK_LABEL(label), tmp); + label = new_info_label_in_frame(box, "IOPS"); gtk_label_set_text(GTK_LABEL(label), iops_p); label = new_info_label_in_frame(box, "Runtime (msec)"); @@ -1212,7 +1216,7 @@ if (calc_lat(&ts->bw_stat[ddir], &min[0], &max[0], &mean[0], &dev[0])) { double p_of_agg = 100.0; - const char *bw_str = "KB"; + const char *bw_str = "KiB/s"; char tmp[32]; if (rs->agg[ddir]) { @@ -1221,14 +1225,21 @@ p_of_agg = 100.0; } - if (mean[0] > 999999.9) { - min[0] /= 1000.0; - max[0] /= 1000.0; - mean[0] /= 1000.0; - dev[0] /= 1000.0; - bw_str = "MB"; + if (mean[0] > 1073741824.9) { + min[0] /= 1048576.0; + max[0] /= 1048576.0; + mean[0] /= 1048576.0; + dev[0] /= 1048576.0; + bw_str = "GiB/s"; } + if (mean[0] > 1047575.9) { + min[0] /= 1024.0; + max[0] /= 1024.0; + mean[0] /= 1024.0; + dev[0] /= 1024.0; + bw_str = "MiB/s"; + } sprintf(tmp, "Bandwidth (%s)", bw_str); frame = gtk_frame_new(tmp); gtk_box_pack_start(GTK_BOX(main_vbox), frame, FALSE, FALSE, 5); @@ -1278,6 +1289,8 @@ free(io_p); free(bw_p); + free(io_palt); + free(bw_palt); free(iops_p); } diff -Nru fio-2.16/gettime.c fio-3.1/gettime.c --- fio-2.16/gettime.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/gettime.c 2017-09-28 10:23:20.000000000 +0000 @@ -15,19 +15,22 @@ #if defined(ARCH_HAVE_CPU_CLOCK) #ifndef ARCH_CPU_CLOCK_CYCLES_PER_USEC -static unsigned long cycles_per_usec; -static unsigned long inv_cycles_per_usec; -static uint64_t max_cycles_for_mult; +static unsigned long cycles_per_msec; +static unsigned long long cycles_start; +static unsigned long long clock_mult; +static unsigned long long max_cycles_mask; +static unsigned long long nsecs_for_max_cycles; +static unsigned int clock_shift; +static unsigned int max_cycles_shift; +#define MAX_CLOCK_SEC 60*60 #endif #ifdef ARCH_CPU_CLOCK_WRAPS -static unsigned long long 
cycles_start, cycles_wrap; +static unsigned int cycles_wrap; #endif #endif -int tsc_reliable = 0; +bool tsc_reliable = false; struct tv_valid { - uint64_t last_cycles; - int last_tv_valid; int warned; }; #ifdef ARCH_HAVE_CPU_CLOCK @@ -143,31 +146,31 @@ } #endif -static void __fio_gettime(struct timeval *tp) +static void __fio_gettime(struct timespec *tp) { switch (fio_clock_source) { #ifdef CONFIG_GETTIMEOFDAY - case CS_GTOD: - gettimeofday(tp, NULL); + case CS_GTOD: { + struct timeval tv; + gettimeofday(&tv, NULL); + + tp->tv_sec = tv.tv_sec; + tp->tv_nsec = tv.tv_usec * 1000; break; + } #endif #ifdef CONFIG_CLOCK_GETTIME case CS_CGETTIME: { - struct timespec ts; - - if (fill_clock_gettime(&ts) < 0) { + if (fill_clock_gettime(tp) < 0) { log_err("fio: clock_gettime fails\n"); assert(0); } - - tp->tv_sec = ts.tv_sec; - tp->tv_usec = ts.tv_nsec / 1000; break; } #endif #ifdef ARCH_HAVE_CPU_CLOCK case CS_CPUCLOCK: { - uint64_t usecs, t; + uint64_t nsecs, t, multiples; struct tv_valid *tv; #ifdef CONFIG_TLS_THREAD @@ -184,21 +187,17 @@ log_err("fio: double CPU clock wrap\n"); tv->warned = 1; } - - t -= cycles_start; #endif - tv->last_cycles = t; - tv->last_tv_valid = 1; #ifdef ARCH_CPU_CLOCK_CYCLES_PER_USEC - usecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC; + nsecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC * 1000; #else - if (t < max_cycles_for_mult) - usecs = (t * inv_cycles_per_usec) / 16777216UL; - else - usecs = t / cycles_per_usec; + t -= cycles_start; + multiples = t >> max_cycles_shift; + nsecs = multiples * nsecs_for_max_cycles; + nsecs += ((t & max_cycles_mask) * clock_mult) >> clock_shift; #endif - tp->tv_sec = usecs / 1000000; - tp->tv_usec = usecs % 1000000; + tp->tv_sec = nsecs / 1000000000ULL; + tp->tv_nsec = nsecs % 1000000000ULL; break; } #endif @@ -209,9 +208,9 @@ } #ifdef FIO_DEBUG_TIME -void fio_gettime(struct timeval *tp, void *caller) +void fio_gettime(struct timespec *tp, void *caller) #else -void fio_gettime(struct timeval *tp, void fio_unused *caller) 
+void fio_gettime(struct timespec *tp, void fio_unused *caller) #endif { #ifdef FIO_DEBUG_TIME @@ -227,9 +226,9 @@ } #if defined(ARCH_HAVE_CPU_CLOCK) && !defined(ARCH_CPU_CLOCK_CYCLES_PER_USEC) -static unsigned long get_cycles_per_usec(void) +static unsigned long get_cycles_per_msec(void) { - struct timeval s, e; + struct timespec s, e; uint64_t c_s, c_e; enum fio_cs old_cs = fio_clock_source; uint64_t elapsed; @@ -253,7 +252,7 @@ } while (1); fio_clock_source = old_cs; - return (c_e - c_s) / elapsed; + return (c_e - c_s) * 1000 / elapsed; } #define NR_TIME_ITERS 50 @@ -262,12 +261,13 @@ { double delta, mean, S; uint64_t minc, maxc, avg, cycles[NR_TIME_ITERS]; - int i, samples; + int i, samples, sft = 0; + unsigned long long tmp, max_ticks, max_mult; - cycles[0] = get_cycles_per_usec(); + cycles[0] = get_cycles_per_msec(); S = delta = mean = 0.0; for (i = 0; i < NR_TIME_ITERS; i++) { - cycles[i] = get_cycles_per_usec(); + cycles[i] = get_cycles_per_msec(); delta = cycles[i] - mean; if (delta) { mean += delta / (i + 1.0); @@ -304,19 +304,67 @@ dprint(FD_TIME, "cycles[%d]=%llu\n", i, (unsigned long long) cycles[i]); avg /= samples; + cycles_per_msec = avg; dprint(FD_TIME, "avg: %llu\n", (unsigned long long) avg); dprint(FD_TIME, "min=%llu, max=%llu, mean=%f, S=%f\n", (unsigned long long) minc, (unsigned long long) maxc, mean, S); - cycles_per_usec = avg; - inv_cycles_per_usec = 16777216UL / cycles_per_usec; - max_cycles_for_mult = ~0ULL / inv_cycles_per_usec; - dprint(FD_TIME, "inv_cycles_per_usec=%lu\n", inv_cycles_per_usec); -#ifdef ARCH_CPU_CLOCK_WRAPS + max_ticks = MAX_CLOCK_SEC * cycles_per_msec * 1000ULL; + max_mult = ULLONG_MAX / max_ticks; + dprint(FD_TIME, "\n\nmax_ticks=%llu, __builtin_clzll=%d, " + "max_mult=%llu\n", max_ticks, + __builtin_clzll(max_ticks), max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier that does not exceed max_mult + */ + tmp = max_mult * cycles_per_msec / 1000000; + while (tmp > 1) { + tmp >>= 1; + 
sft++; + dprint(FD_TIME, "tmp=%llu, sft=%u\n", tmp, sft); + } + + clock_shift = sft; + clock_mult = (1ULL << sft) * 1000000 / cycles_per_msec; + dprint(FD_TIME, "clock_shift=%u, clock_mult=%llu\n", clock_shift, + clock_mult); + + /* + * Find the greatest power of 2 clock ticks that is less than the + * ticks in MAX_CLOCK_SEC_2STAGE + */ + max_cycles_shift = max_cycles_mask = 0; + tmp = MAX_CLOCK_SEC * 1000ULL * cycles_per_msec; + dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp, + max_cycles_shift); + while (tmp > 1) { + tmp >>= 1; + max_cycles_shift++; + dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + } + /* + * if use use (1ULL << max_cycles_shift) * 1000 / cycles_per_msec + * here we will have a discontinuity every + * (1ULL << max_cycles_shift) cycles + */ + nsecs_for_max_cycles = ((1ULL << max_cycles_shift) * clock_mult) + >> clock_shift; + + /* Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) */ + for (tmp = 0; tmp < max_cycles_shift; tmp++) + max_cycles_mask |= 1ULL << tmp; + + dprint(FD_TIME, "max_cycles_shift=%u, 2^max_cycles_shift=%llu, " + "nsecs_for_max_cycles=%llu, " + "max_cycles_mask=%016llx\n", + max_cycles_shift, (1ULL << max_cycles_shift), + nsecs_for_max_cycles, max_cycles_mask); + cycles_start = get_cpu_clock(); dprint(FD_TIME, "cycles_start=%llu\n", cycles_start); -#endif return 0; } #else @@ -365,7 +413,7 @@ fio_clock_source_inited = fio_clock_source; if (calibrate_cpu_clock()) - tsc_reliable = 0; + tsc_reliable = false; /* * If the arch sets tsc_reliable != 0, then it must be good enough @@ -377,14 +425,35 @@ fio_clock_source = CS_CPUCLOCK; } else if (fio_clock_source == CS_CPUCLOCK) log_info("fio: clocksource=cpu may not be reliable\n"); + dprint(FD_TIME, "gettime: clocksource=%d\n", (int) fio_clock_source); } -uint64_t utime_since(const struct timeval *s, const struct timeval *e) +uint64_t ntime_since(const struct timespec *s, const struct timespec *e) +{ + int64_t sec, nsec; + + sec = 
e->tv_sec - s->tv_sec; + nsec = e->tv_nsec - s->tv_nsec; + if (sec > 0 && nsec < 0) { + sec--; + nsec += 1000000000LL; + } + + /* + * time warp bug on some kernels? + */ + if (sec < 0 || (sec == 0 && nsec < 0)) + return 0; + + return nsec + (sec * 1000000000LL); +} + +uint64_t utime_since(const struct timespec *s, const struct timespec *e) { int64_t sec, usec; sec = e->tv_sec - s->tv_sec; - usec = e->tv_usec - s->tv_usec; + usec = (e->tv_nsec - s->tv_nsec) / 1000; if (sec > 0 && usec < 0) { sec--; usec += 1000000; @@ -399,20 +468,26 @@ return usec + (sec * 1000000); } -uint64_t utime_since_now(const struct timeval *s) +uint64_t utime_since_now(const struct timespec *s) { - struct timeval t; + struct timespec t; +#ifdef FIO_DEBUG_TIME + void *p = __builtin_return_address(0); + fio_gettime(&t, p); +#else fio_gettime(&t, NULL); +#endif + return utime_since(s, &t); } -uint64_t mtime_since(const struct timeval *s, const struct timeval *e) +uint64_t mtime_since_tv(const struct timeval *s, const struct timeval *e) { - long sec, usec; + int64_t sec, usec; sec = e->tv_sec - s->tv_sec; - usec = e->tv_usec - s->tv_usec; + usec = (e->tv_usec - s->tv_usec); if (sec > 0 && usec < 0) { sec--; usec += 1000000; @@ -426,16 +501,40 @@ return sec + usec; } -uint64_t mtime_since_now(const struct timeval *s) +uint64_t mtime_since_now(const struct timespec *s) { - struct timeval t; + struct timespec t; +#ifdef FIO_DEBUG_TIME void *p = __builtin_return_address(0); fio_gettime(&t, p); +#else + fio_gettime(&t, NULL); +#endif + return mtime_since(s, &t); } -uint64_t time_since_now(const struct timeval *s) +uint64_t mtime_since(const struct timespec *s, const struct timespec *e) +{ + int64_t sec, usec; + + sec = e->tv_sec - s->tv_sec; + usec = (e->tv_nsec - s->tv_nsec) / 1000; + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + if (sec < 0 || (sec == 0 && usec < 0)) + return 0; + + sec *= 1000; + usec /= 1000; + return sec + usec; +} + +uint64_t time_since_now(const struct 
timespec *s) { return mtime_since_now(s) / 1000; } @@ -444,7 +543,7 @@ defined(CONFIG_SFAA) #define CLOCK_ENTRIES_DEBUG 100000 -#define CLOCK_ENTRIES_TEST 10000 +#define CLOCK_ENTRIES_TEST 1000 struct clock_entry { uint32_t seq; diff -Nru fio-2.16/gettime.h fio-3.1/gettime.h --- fio-2.16/gettime.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/gettime.h 2017-09-28 10:23:20.000000000 +0000 @@ -13,27 +13,27 @@ CS_INVAL, }; -extern void fio_gettime(struct timeval *, void *); +extern void fio_gettime(struct timespec *, void *); extern void fio_gtod_init(void); extern void fio_clock_init(void); extern int fio_start_gtod_thread(void); extern int fio_monotonic_clocktest(int debug); extern void fio_local_clock_init(int); -extern struct timeval *fio_tv; +extern struct timespec *fio_ts; -static inline int fio_gettime_offload(struct timeval *tv) +static inline int fio_gettime_offload(struct timespec *ts) { time_t last_sec; - if (!fio_tv) + if (!fio_ts) return 0; do { read_barrier(); - last_sec = tv->tv_sec = fio_tv->tv_sec; - tv->tv_usec = fio_tv->tv_usec; - } while (fio_tv->tv_sec != last_sec); + last_sec = ts->tv_sec = fio_ts->tv_sec; + ts->tv_nsec = fio_ts->tv_nsec; + } while (fio_ts->tv_sec != last_sec); return 1; } diff -Nru fio-2.16/gettime-thread.c fio-3.1/gettime-thread.c --- fio-2.16/gettime-thread.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/gettime-thread.c 2017-09-28 10:23:20.000000000 +0000 @@ -6,30 +6,30 @@ #include "fio.h" #include "smalloc.h" -struct timeval *fio_tv = NULL; +struct timespec *fio_ts = NULL; int fio_gtod_offload = 0; static pthread_t gtod_thread; static os_cpu_mask_t fio_gtod_cpumask; void fio_gtod_init(void) { - if (fio_tv) + if (fio_ts) return; - fio_tv = smalloc(sizeof(struct timeval)); - if (!fio_tv) + fio_ts = smalloc(sizeof(*fio_ts)); + if (!fio_ts) log_err("fio: smalloc pool exhausted\n"); } static void fio_gtod_update(void) { - if (fio_tv) { + if (fio_ts) { struct timeval __tv; gettimeofday(&__tv, NULL); - fio_tv->tv_sec = 
__tv.tv_sec; + fio_ts->tv_sec = __tv.tv_sec; write_barrier(); - fio_tv->tv_usec = __tv.tv_usec; + fio_ts->tv_nsec = __tv.tv_usec * 1000; write_barrier(); } } diff -Nru fio-2.16/gfio.c fio-3.1/gfio.c --- fio-2.16/gfio.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/gfio.c 2017-09-28 10:23:20.000000000 +0000 @@ -1215,7 +1215,7 @@ { const char *authors[] = { "Jens Axboe ", - "Stephen Carmeron ", + "Stephen Cameron ", NULL }; const char *license[] = { @@ -1240,10 +1240,10 @@ "program-name", "gfio", "comments", "Gtk2 UI for fio", "license", license_trans, - "website", "http://git.kernel.dk/?p=fio.git;a=summary", + "website", "http://git.kernel.dk/cgit/fio/", "authors", authors, "version", fio_version_string, - "copyright", "© 2012 Jens Axboe ", + "copyright", "© 2012-2017 Jens Axboe ", "logo-icon-name", "fio", /* Must be last: */ "wrap-license", TRUE, @@ -1271,7 +1271,7 @@ { "Quit", GTK_STOCK_QUIT, NULL, "Q", NULL, G_CALLBACK(quit_clicked) }, { "About", GTK_STOCK_ABOUT, NULL, NULL, NULL, G_CALLBACK(about_dialog) }, }; -static gint nmenu_items = sizeof(menu_items) / sizeof(menu_items[0]); +static gint nmenu_items = ARRAY_SIZE(menu_items); static const gchar *ui_string = " \ \ @@ -1386,7 +1386,7 @@ g_signal_connect(ge->eta.names, "changed", G_CALLBACK(combo_entry_changed), ge); g_signal_connect(ge->eta.names, "destroy", G_CALLBACK(combo_entry_destroy), ge); ge->eta.iotype.entry = new_info_entry_in_frame(probe_box, "IO"); - ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write)"); + ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write/Trim)"); ge->eta.ioengine.entry = new_info_entry_in_frame(probe_box, "IO Engine"); ge->eta.iodepth.entry = new_info_entry_in_frame(probe_box, "IO Depth"); ge->eta.jobs = new_info_entry_in_frame(probe_box, "Jobs"); @@ -1395,11 +1395,11 @@ probe_box = gtk_hbox_new(FALSE, 3); gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, FALSE, FALSE, 3); ge->eta.read_bw = 
new_info_entry_in_frame_rgb(probe_box, "Read BW", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); - ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); + ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "Read IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); ge->eta.write_bw = new_info_entry_in_frame_rgb(probe_box, "Write BW", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); - ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); + ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "Write IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); ge->eta.trim_bw = new_info_entry_in_frame_rgb(probe_box, "Trim BW", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); - ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); + ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "Trim IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); /* * Only add this if we have a commit rate diff -Nru fio-2.16/.gitignore fio-3.1/.gitignore --- fio-2.16/.gitignore 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/.gitignore 2017-09-28 10:23:20.000000000 +0000 @@ -10,3 +10,4 @@ y.tab.* lex.yy.c *.un~ +doc/output diff -Nru fio-2.16/goptions.c fio-3.1/goptions.c --- fio-2.16/goptions.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/goptions.c 2017-09-28 10:23:20.000000000 +0000 @@ -826,7 +826,7 @@ unsigned long long *p, unsigned int idx) { struct gopt_str_val *g; - const gchar *postfix[] = { "B", "KB", "MB", "GB", "PB", "TB", "" }; + const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "PiB", "PiB", "" }; GtkWidget *label; int i; diff -Nru fio-2.16/helper_thread.c fio-3.1/helper_thread.c --- fio-2.16/helper_thread.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/helper_thread.c 2017-09-28 10:23:20.000000000 +0000 @@ -71,45 +71,45 @@ { struct helper_data *hd = data; unsigned int msec_to_next_event, next_log, next_ss = STEADYSTATE_MSEC; - struct 
timeval tv, last_du, last_ss; + struct timeval tv; + struct timespec ts, last_du, last_ss; int ret = 0; sk_out_assign(hd->sk_out); gettimeofday(&tv, NULL); - memcpy(&last_du, &tv, sizeof(tv)); - memcpy(&last_ss, &tv, sizeof(tv)); + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; + memcpy(&last_du, &ts, sizeof(ts)); + memcpy(&last_ss, &ts, sizeof(ts)); fio_mutex_up(hd->startup_mutex); msec_to_next_event = DISK_UTIL_MSEC; while (!ret && !hd->exit) { - struct timespec ts; - struct timeval now; uint64_t since_du, since_ss = 0; - timeval_add_msec(&tv, msec_to_next_event); - ts.tv_sec = tv.tv_sec; - ts.tv_nsec = tv.tv_usec * 1000; + timespec_add_msec(&ts, msec_to_next_event); pthread_mutex_lock(&hd->lock); pthread_cond_timedwait(&hd->cond, &hd->lock, &ts); - gettimeofday(&now, NULL); + gettimeofday(&tv, NULL); + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; if (hd->reset) { - memcpy(&tv, &now, sizeof(tv)); - memcpy(&last_du, &now, sizeof(last_du)); - memcpy(&last_ss, &now, sizeof(last_ss)); + memcpy(&last_du, &ts, sizeof(ts)); + memcpy(&last_ss, &ts, sizeof(ts)); hd->reset = 0; } pthread_mutex_unlock(&hd->lock); - since_du = mtime_since(&last_du, &now); + since_du = mtime_since(&last_du, &ts); if (since_du >= DISK_UTIL_MSEC || DISK_UTIL_MSEC - since_du < 10) { ret = update_io_ticks(); - timeval_add_msec(&last_du, DISK_UTIL_MSEC); + timespec_add_msec(&last_du, DISK_UTIL_MSEC); msec_to_next_event = DISK_UTIL_MSEC; if (since_du >= DISK_UTIL_MSEC) msec_to_next_event -= (since_du - DISK_UTIL_MSEC); @@ -126,10 +126,10 @@ next_log = DISK_UTIL_MSEC; if (steadystate_enabled) { - since_ss = mtime_since(&last_ss, &now); + since_ss = mtime_since(&last_ss, &ts); if (since_ss >= STEADYSTATE_MSEC || STEADYSTATE_MSEC - since_ss < 10) { steadystate_check(); - timeval_add_msec(&last_ss, since_ss); + timespec_add_msec(&last_ss, since_ss); if (since_ss > STEADYSTATE_MSEC) next_ss = STEADYSTATE_MSEC - (since_ss - STEADYSTATE_MSEC); else diff -Nru fio-2.16/HOWTO 
fio-3.1/HOWTO --- fio-2.16/HOWTO 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/HOWTO 2017-09-28 10:23:20.000000000 +0000 @@ -1,2415 +1,3718 @@ -Table of contents ------------------ +How fio works +------------- + +The first step in getting fio to simulate a desired I/O workload, is writing a +job file describing that specific setup. A job file may contain any number of +threads and/or files -- the typical contents of the job file is a *global* +section defining shared parameters, and one or more job sections describing the +jobs involved. When run, fio parses this file and sets everything up as +described. If we break down a job from top to bottom, it contains the following +basic parameters: + +`I/O type`_ + + Defines the I/O pattern issued to the file(s). We may only be reading + sequentially from this file(s), or we may be writing randomly. Or even + mixing reads and writes, sequentially or randomly. + Should we be doing buffered I/O, or direct/raw I/O? + +`Block size`_ + + In how large chunks are we issuing I/O? This may be a single value, + or it may describe a range of block sizes. + +`I/O size`_ + + How much data are we going to be reading/writing. + +`I/O engine`_ + + How do we issue I/O? We could be memory mapping the file, we could be + using regular read/write, we could be using splice, async I/O, or even + SG (SCSI generic sg). + +`I/O depth`_ + + If the I/O engine is async, how large a queuing depth do we want to + maintain? + + +`Target file/device`_ + + How many files are we spreading the workload over. + +`Threads, processes and job synchronization`_ + + How many threads or processes should we spread this workload over. + +The above are the basic parameters defined for a workload, in addition there's a +multitude of parameters that modify other aspects of how this job behaves. + + +Command line options +-------------------- + +.. option:: --debug=type + + Enable verbose tracing `type` of various fio actions. 
May be ``all`` for all types + or individual types separated by a comma (e.g. ``--debug=file,mem`` will + enable file and memory debugging). Currently, additional logging is + available for: + + *process* + Dump info related to processes. + *file* + Dump info related to file actions. + *io* + Dump info related to I/O queuing. + *mem* + Dump info related to memory allocations. + *blktrace* + Dump info related to blktrace setup. + *verify* + Dump info related to I/O verification. + *all* + Enable all debug options. + *random* + Dump info related to random offset generation. + *parse* + Dump info related to option matching and parsing. + *diskutil* + Dump info related to disk utilization updates. + *job:x* + Dump info only related to job number x. + *mutex* + Dump info only related to mutex up/down ops. + *profile* + Dump info related to profile extensions. + *time* + Dump info related to internal time keeping. + *net* + Dump info related to networking connections. + *rate* + Dump info related to I/O rate switching. + *compress* + Dump info related to log compress/decompress. + *?* or *help* + Show available debug options. + +.. option:: --parse-only + + Parse options only, don't start any I/O. + +.. option:: --output=filename + + Write output to file `filename`. + +.. option:: --output-format=format + + Set the reporting `format` to `normal`, `terse`, `json`, or `json+`. Multiple + formats can be selected, separated by a comma. `terse` is a CSV based + format. `json+` is like `json`, except it adds a full dump of the latency + buckets. + +.. option:: --bandwidth-log + + Generate aggregate bandwidth logs. + +.. option:: --minimal + + Print statistics in a terse, semicolon-delimited format. + +.. option:: --append-terse + + Print statistics in selected mode AND terse, semicolon-delimited format. + **Deprecated**, use :option:`--output-format` instead to select multiple + formats. + +.. 
option:: --terse-version=version + + Set terse `version` output format (default 3, or 2 or 4 or 5). + +.. option:: --version + + Print version information and exit. + +.. option:: --help + + Print a summary of the command line options and exit. + +.. option:: --cpuclock-test + + Perform test and validation of internal CPU clock. + +.. option:: --crctest=[test] + + Test the speed of the built-in checksumming functions. If no argument is + given, all of them are tested. Alternatively, a comma separated list can + be passed, in which case the given ones are tested. + +.. option:: --cmdhelp=command + + Print help information for `command`. May be ``all`` for all commands. + +.. option:: --enghelp=[ioengine[,command]] + + List all commands defined by `ioengine`, or print help for `command` + defined by `ioengine`. If no `ioengine` is given, list all + available ioengines. + +.. option:: --showcmd=jobfile + + Convert `jobfile` to a set of command-line options. + +.. option:: --readonly + + Turn on safety read-only checks, preventing writes. The ``--readonly`` + option is an extra safety guard to prevent users from accidentally starting + a write workload when that is not desired. Fio will only write if + `rw=write/randwrite/rw/randrw` is given. This extra safety net can be used + as an extra precaution as ``--readonly`` will also enable a write check in + the I/O engine core to prevent writes due to unknown user space bug(s). + +.. option:: --eta=when + + Specifies when real-time ETA estimate should be printed. `when` may be + `always`, `never` or `auto`. + +.. option:: --eta-newline=time + + Force a new line for every `time` period passed. When the unit is omitted, + the value is interpreted in seconds. + +.. option:: --status-interval=time + + Force a full status dump of cumulative (from job start) values at `time` + intervals. This option does *not* provide per-period measurements. So + values such as bandwidth are running averages. 
When the time unit is omitted, + `time` is interpreted in seconds. + +.. option:: --section=name + + Only run specified section `name` in job file. Multiple sections can be specified. + The ``--section`` option allows one to combine related jobs into one file. + E.g. one job file could define light, moderate, and heavy sections. Tell + fio to run only the "heavy" section by giving ``--section=heavy`` + command line option. One can also specify the "write" operations in one + section and "verify" operation in another section. The ``--section`` option + only applies to job sections. The reserved *global* section is always + parsed and used. + +.. option:: --alloc-size=kb + + Set the internal smalloc pool size to `kb` in KiB. The + ``--alloc-size`` switch allows one to use a larger pool size for smalloc. + If running large jobs with randommap enabled, fio can run out of memory. + Smalloc is an internal allocator for shared structures from a fixed size + memory pool and can grow to 16 pools. The pool size defaults to 16MiB. + + NOTE: While running :file:`.fio_smalloc.*` backing store files are visible + in :file:`/tmp`. + +.. option:: --warnings-fatal + + All fio parser warnings are fatal, causing fio to exit with an + error. + +.. option:: --max-jobs=nr + + Set the maximum number of threads/processes to support to `nr`. + +.. option:: --server=args + + Start a backend server, with `args` specifying what to listen to. + See `Client/Server`_ section. + +.. option:: --daemonize=pidfile + + Background a fio server, writing the pid to the given `pidfile` file. + +.. option:: --client=hostname + + Instead of running the jobs locally, send and run them on the given `hostname` + or set of `hostname`s. See `Client/Server`_ section. + +.. option:: --remote-config=file + + Tell fio server to load this local `file`. + +.. option:: --idle-prof=option + + Report CPU idleness. `option` is one of the following: + + **calibrate** + Run unit work calibration only and exit. 
+ + **system** + Show aggregate system idleness and unit work. + + **percpu** + As **system** but also show per CPU idleness. + +.. option:: --inflate-log=log + + Inflate and output compressed `log`. + +.. option:: --trigger-file=file + + Execute trigger command when `file` exists. + +.. option:: --trigger-timeout=time + + Execute trigger at this `time`. + +.. option:: --trigger=command + + Set this `command` as local trigger. + +.. option:: --trigger-remote=command + + Set this `command` as remote trigger. + +.. option:: --aux-path=path + + Use this `path` for fio state generated files. + +Any parameters following the options will be assumed to be job files, unless +they match a job file parameter. Multiple job files can be listed and each job +file will be regarded as a separate group. Fio will :option:`stonewall` +execution between each group. + + +Job file format +--------------- + +As previously described, fio accepts one or more job files describing what it is +supposed to do. The job file format is the classic ini file, where the names +enclosed in [] brackets define the job name. You are free to use any ASCII name +you want, except *global* which has special meaning. Following the job name is +a sequence of zero or more parameters, one per line, that define the behavior of +the job. If the first character in a line is a ';' or a '#', the entire line is +discarded as a comment. + +A *global* section sets defaults for the jobs described in that file. A job may +override a *global* section parameter, and a job file may even have several +*global* sections if so desired. A job is only affected by a *global* section +residing above it. + +The :option:`--cmdhelp` option also lists all options. If used with a `command` +argument, :option:`--cmdhelp` will detail the given `command`. + +See the `examples/` directory for inspiration on how to write job files. Note +the copyright and license requirements currently apply to `examples/` files. 
+ +So let's look at a really simple job file that defines two processes, each +randomly reading from a 128MiB file: + +.. code-block:: ini + + ; -- start job file -- + [global] + rw=randread + size=128m + + [job1] + + [job2] + + ; -- end job file -- + +As you can see, the job file sections themselves are empty as all the described +parameters are shared. As no :option:`filename` option is given, fio makes up a +`filename` for each of the jobs as it sees fit. On the command line, this job +would look as follows:: + +$ fio --name=global --rw=randread --size=128m --name=job1 --name=job2 + + +Let's look at an example that has a number of processes writing randomly to +files: + +.. code-block:: ini + + ; -- start job file -- + [random-writers] + ioengine=libaio + iodepth=4 + rw=randwrite + bs=32k + direct=0 + size=64m + numjobs=4 + ; -- end job file -- + +Here we have no *global* section, as we only have one job defined anyway. We +want to use async I/O here, with a depth of 4 for each file. We also increased +the buffer size used to 32KiB and define numjobs to 4 to fork 4 identical +jobs. The result is 4 processes each randomly writing to their own 64MiB +file. Instead of using the above job file, you could have given the parameters +on the command line. For this case, you would specify:: + +$ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4 + +When fio is utilized as a basis of any reasonably large test suite, it might be +desirable to share a set of standardized settings across multiple job files. +Instead of copy/pasting such settings, any section may pull in an external +:file:`filename.fio` file with *include filename* directive, as in the following +example:: + + ; -- start job file including.fio -- + [global] + filename=/tmp/test + filesize=1m + include glob-include.fio + + [test] + rw=randread + bs=4k + time_based=1 + runtime=10 + include test-include.fio + ; -- end job file including.fio -- + +.. 
code-block:: ini + + ; -- start job file glob-include.fio -- + thread=1 + group_reporting=1 + ; -- end job file glob-include.fio -- + +.. code-block:: ini + + ; -- start job file test-include.fio -- + ioengine=libaio + iodepth=4 + ; -- end job file test-include.fio -- + +Settings pulled into a section apply to that section only (except *global* +section). Include directives may be nested in that any included file may contain +further include directive(s). Include files may not contain [] sections. + + +Environment variables +~~~~~~~~~~~~~~~~~~~~~ + +Fio also supports environment variable expansion in job files. Any sub-string of +the form ``${VARNAME}`` as part of an option value (in other words, on the right +of the '='), will be expanded to the value of the environment variable called +`VARNAME`. If no such environment variable is defined, or `VARNAME` is the +empty string, the empty string will be substituted. + +As an example, let's look at a sample fio invocation and job file:: + +$ SIZE=64m NUMJOBS=4 fio jobfile.fio + +.. code-block:: ini + + ; -- start job file -- + [random-writers] + rw=randwrite + size=${SIZE} + numjobs=${NUMJOBS} + ; -- end job file -- + +This will expand to the following equivalent job file at runtime: + +.. code-block:: ini + + ; -- start job file -- + [random-writers] + rw=randwrite + size=64m + numjobs=4 + ; -- end job file -- + +Fio ships with a few example job files, you can also look there for inspiration. + +Reserved keywords +~~~~~~~~~~~~~~~~~ + +Additionally, fio has a set of reserved keywords that will be replaced +internally with the appropriate value. Those keywords are: + +**$pagesize** + + The architecture page size of the running system. + +**$mb_memory** + + Megabytes of total memory in the system. + +**$ncpus** + + Number of online available CPUs. + +These can be used on the command line or in the job file, and will be +automatically substituted with the current system values when the job is +run. 
Simple math is also supported on these keywords, so you can perform actions +like:: + + size=8*$mb_memory + +and get that properly expanded to 8 times the size of memory in the machine. + + +Job file parameters +------------------- + +This section describes in detail each parameter associated with a job. Some +parameters take an option of a given type, such as an integer or a +string. Anywhere a numeric value is required, an arithmetic expression may be +used, provided it is surrounded by parentheses. Supported operators are: + + - addition (+) + - subtraction (-) + - multiplication (*) + - division (/) + - modulus (%) + - exponentiation (^) + +For time values in expressions, units are microseconds by default. This is +different than for time values not in expressions (not enclosed in +parentheses). The following types are used: + + +Parameter types +~~~~~~~~~~~~~~~ + +**str** + String: A sequence of alphanumeric characters. + +**time** + Integer with possible time suffix. Without a unit, the value is interpreted as + seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h' for + hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and + 'us' (or 'usec') for microseconds. For example, use 10m for 10 minutes. + +.. _int: + +**int** + Integer. A whole number value, which may contain an integer prefix + and an integer suffix: + + [*integer prefix*] **number** [*integer suffix*] + + The optional *integer prefix* specifies the number's base. The default + is decimal. *0x* specifies hexadecimal. + + The optional *integer suffix* specifies the number's units, and includes an + optional unit prefix and an optional unit. For quantities of data, the + default unit is bytes. For quantities of time, the default unit is seconds + unless otherwise specified. + + With :option:`kb_base`\=1000, fio follows international standards for unit + prefixes. 
To specify power-of-10 decimal values defined in the + International System of Units (SI): + + * *K* -- means kilo (K) or 1000 + * *M* -- means mega (M) or 1000**2 + * *G* -- means giga (G) or 1000**3 + * *T* -- means tera (T) or 1000**4 + * *P* -- means peta (P) or 1000**5 + + To specify power-of-2 binary values defined in IEC 80000-13: + + * *Ki* -- means kibi (Ki) or 1024 + * *Mi* -- means mebi (Mi) or 1024**2 + * *Gi* -- means gibi (Gi) or 1024**3 + * *Ti* -- means tebi (Ti) or 1024**4 + * *Pi* -- means pebi (Pi) or 1024**5 + + With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite + from those specified in the SI and IEC 80000-13 standards to provide + compatibility with old scripts. For example, 4k means 4096. + + For quantities of data, an optional unit of 'B' may be included + (e.g., 'kB' is the same as 'k'). + + The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega, + not milli). 'b' and 'B' both mean byte, not bit. + + Examples with :option:`kb_base`\=1000: + + * *4 KiB*: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB + * *1 MiB*: 1048576, 1mi, 1024ki + * *1 MB*: 1000000, 1m, 1000k + * *1 TiB*: 1099511627776, 1ti, 1024gi, 1048576mi + * *1 TB*: 1000000000000, 1t, 1000g, 1000000m + + Examples with :option:`kb_base`\=1024 (default): + + * *4 KiB*: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB + * *1 MiB*: 1048576, 1m, 1024k + * *1 MB*: 1000000, 1mi, 1000ki + * *1 TiB*: 1099511627776, 1t, 1024g, 1048576m + * *1 TB*: 1000000000000, 1ti, 1000gi, 1000000mi + + To specify times (units are not case sensitive): + + * *D* -- means days + * *H* -- means hours + * *M* -- means minutes + * *s* -- or sec means seconds (default) + * *ms* -- or *msec* means milliseconds + * *us* -- or *usec* means microseconds + + If the option accepts an upper and lower range, use a colon ':' or + minus '-' to separate such values. See :ref:`irange <irange>`. + If the lower value specified happens to be larger than the upper value + the two values are swapped. + +.. 
_bool: + +**bool** + Boolean. Usually parsed as an integer, however only defined for + true and false (1 and 0). + +.. _irange: + +**irange** + Integer range with suffix. Allows a value range to be given, such as + 1024-4096. A colon may also be used as the separator, e.g. 1k:4k. If the + option allows two sets of ranges, they can be specified with a ',' or '/' + delimiter: 1k-4k/8k-32k. Also see :ref:`int <int>`. + +**float_list** + A list of floating point numbers, separated by a ':' character. + +With the above in mind, here follows the complete list of fio job parameters. + + +Units +~~~~~ + +.. option:: kb_base=int + + Select the interpretation of unit prefixes in input parameters. + + **1000** + Inputs comply with IEC 80000-13 and the International + System of Units (SI). Use: + + - power-of-2 values with IEC prefixes (e.g., KiB) + - power-of-10 values with SI prefixes (e.g., kB) + + **1024** + Compatibility mode (default). To avoid breaking old scripts: + + - power-of-2 values with SI prefixes + - power-of-10 values with IEC prefixes + + See :option:`bs` for more details on input parameters. + + Outputs always use correct prefixes. Most outputs include both + side-by-side, like:: + + bw=2383.3kB/s (2327.4KiB/s) + + If only one value is reported, then kb_base selects the one to use: + + **1000** -- SI prefixes + + **1024** -- IEC prefixes + +.. option:: unit_base=int + + Base unit for reporting. Allowed values are: + + **0** + Use auto-detection (default). + **8** + Byte based. + **1** + Bit based. + + +Job description +~~~~~~~~~~~~~~~ + +.. option:: name=str + + ASCII name of the job. This may be used to override the name printed by fio + for this job. Otherwise the job name is used. On the command line this + parameter has the special purpose of also signaling the start of a new job. + +.. option:: description=str + + Text description of the job. Doesn't do anything except dump this text + description when this job is run. It's not parsed. + +..
option:: loops=int + + Run the specified number of iterations of this job. Used to repeat the same + workload a given number of times. Defaults to 1. + +.. option:: numjobs=int + + Create the specified number of clones of this job. Each clone of the job + is spawned as an independent thread or process. May be used to set up a + larger number of threads/processes doing the same thing. Each thread is + reported separately; to see statistics for all clones as a whole, use + :option:`group_reporting` in conjunction with :option:`new_group`. + See :option:`--max-jobs`. Default: 1. + + +Time related parameters +~~~~~~~~~~~~~~~~~~~~~~~ + +.. option:: runtime=time + + Tell fio to terminate processing after the specified period of time. It + can be quite hard to determine for how long a specified job will run, so + this parameter is handy to cap the total runtime to a given time. When + the unit is omitted, the value is interpreted in seconds. + +.. option:: time_based + + If set, fio will run for the duration of the :option:`runtime` specified + even if the file(s) are completely read or written. It will simply loop over + the same workload as many times as the :option:`runtime` allows. + +.. option:: startdelay=irange(time) + + Delay the start of job for the specified amount of time. Can be a single + value or a range. When given as a range, each thread will choose a value + randomly from within the range. Value is in seconds if a unit is omitted. + +.. option:: ramp_time=time + + If set, fio will run the specified workload for this amount of time before + logging any performance numbers. Useful for letting performance settle + before logging results, thus minimizing the runtime required for stable + results. Note that the ``ramp_time`` is considered lead in time for a job, + thus it will increase the total runtime if a special timeout or + :option:`runtime` is specified. When the unit is omitted, the value is + given in seconds. + +..
option:: clocksource=str + + Use the given clocksource as the base of timing. The supported options are: + + **gettimeofday** + :manpage:`gettimeofday(2)` + + **clock_gettime** + :manpage:`clock_gettime(2)` + + **cpu** + Internal CPU clock source + + cpu is the preferred clocksource if it is reliable, as it is very fast (and + fio is heavy on time calls). Fio will automatically use this clocksource if + it's supported and considered reliable on the system it is running on, + unless another clocksource is specifically set. For x86/x86-64 CPUs, this + means supporting TSC Invariant. + +.. option:: gtod_reduce=bool + + Enable all of the :manpage:`gettimeofday(2)` reducing options + (:option:`disable_clat`, :option:`disable_slat`, :option:`disable_bw_measurement`) plus + reduce precision of the timeout somewhat to really shrink the + :manpage:`gettimeofday(2)` call count. With this option enabled, we only do + about 0.4% of the :manpage:`gettimeofday(2)` calls we would have done if all + time keeping was enabled. + +.. option:: gtod_cpu=int + + Sometimes it's cheaper to dedicate a single thread of execution to just + getting the current time. Fio (and databases, for instance) are very + intensive on :manpage:`gettimeofday(2)` calls. With this option, you can set + one CPU aside for doing nothing but logging current time to a shared memory + location. Then the other threads/processes that run I/O workloads need only + copy that segment, instead of entering the kernel with a + :manpage:`gettimeofday(2)` call. The CPU set aside for doing these time + calls will be excluded from other uses. Fio will manually clear it from the + CPU mask of other jobs. + + +Target file/device +~~~~~~~~~~~~~~~~~~ + +.. option:: directory=str + + Prefix filenames with this directory. Used to place files in a different + location than :file:`./`. You can specify a number of directories by + separating the names with a ':' character. 
These directories will be + assigned equally distributed to job clones created by :option:`numjobs` as + long as they are using generated filenames. If specific `filename(s)` are + set fio will use the first listed directory, and thereby matching the + `filename` semantic which generates a file each clone if not specified, but + let all clones use the same if set. + + See the :option:`filename` option for information on how to escape "``:``" and + "``\``" characters within the directory path itself. + +.. option:: filename=str + + Fio normally makes up a `filename` based on the job name, thread number, and + file number (see :option:`filename_format`). If you want to share files + between threads in a job or several + jobs with fixed file paths, specify a `filename` for each of them to override + the default. If the ioengine is file based, you can specify a number of files + by separating the names with a ':' colon. So if you wanted a job to open + :file:`/dev/sda` and :file:`/dev/sdb` as the two working files, you would use + ``filename=/dev/sda:/dev/sdb``. This also means that whenever this option is + specified, :option:`nrfiles` is ignored. The size of regular files specified + by this option will be :option:`size` divided by number of files unless an + explicit size is specified by :option:`filesize`. + + Each colon and backslash in the wanted path must be escaped with a ``\`` + character. For instance, if the path is :file:`/dev/dsk/foo@3,0:c` then you + would use ``filename=/dev/dsk/foo@3,0\:c`` and if the path is + :file:`F:\\filename` then you would use ``filename=F\:\\filename``. + + On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for + the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc. + Note: Windows and FreeBSD prevent write access to areas + of the disk containing in-use data (e.g. filesystems). + + The filename "`-`" is a reserved name, meaning *stdin* or *stdout*. 
Which + of the two depends on the read/write direction set. + +.. option:: filename_format=str + + If sharing multiple files between jobs, it is usually necessary to have fio + generate the exact names that you want. By default, fio will name a file + based on the default file format specification of + :file:`jobname.jobnumber.filenumber`. With this option, that can be + customized. Fio will recognize and replace the following keywords in this + string: + + **$jobname** + The name of the worker thread or process. + **$jobnum** + The incremental number of the worker thread or process. + **$filenum** + The incremental number of the file for that worker thread or + process. + + To have dependent jobs share a set of files, this option can be set to have + fio generate filenames that are shared between the two. For instance, if + :file:`testfiles.$filenum` is specified, file number 4 for any job will be + named :file:`testfiles.4`. The default of :file:`$jobname.$jobnum.$filenum` + will be used if no other format specifier is given. + +.. option:: unique_filename=bool + + To avoid collisions between networked clients, fio defaults to prefixing any + generated filenames (with a directory specified) with the source of the + client connecting. To disable this behavior, set this option to 0. + +.. option:: opendir=str + + Recursively open any files below directory `str`. + +.. option:: lockfile=str + + Fio defaults to not locking any files before it does I/O to them. If a file + or file descriptor is shared, fio can serialize I/O to that file to make the + end result consistent. This is useful for emulating real workloads that share + files. The lock modes are: + + **none** + No locking. The default. + **exclusive** + Only one thread or process may do I/O at a time, excluding all + others. + **readwrite** + Read-write locking on the file. Many readers may + access the file at the same time, but writes get exclusive access. + +..
option:: nrfiles=int + + Number of files to use for this job. Defaults to 1. The size of files + will be :option:`size` divided by this unless explicit size is specified by + :option:`filesize`. Files are created for each thread separately, and each + file will have a file number within its name by default, as explained in + :option:`filename` section. + + +.. option:: openfiles=int + + Number of files to keep open at the same time. Defaults to the same as + :option:`nrfiles`, can be set smaller to limit the number of simultaneous + opens. + +.. option:: file_service_type=str + + Defines how fio decides which file from a job to service next. The following + types are defined: + + **random** + Choose a file at random. + + **roundrobin** + Round robin over opened files. This is the default. + + **sequential** + Finish one file before moving on to the next. Multiple files can + still be open depending on :option:`openfiles`. + + **zipf** + Use a *Zipf* distribution to decide what file to access. + + **pareto** + Use a *Pareto* distribution to decide what file to access. + + **normal** + Use a *Gaussian* (normal) distribution to decide what file to + access. + + **gauss** + Alias for normal. + + For *random*, *roundrobin*, and *sequential*, a postfix can be appended to + tell fio how many I/Os to issue before switching to a new file. For example, + specifying ``file_service_type=random:8`` would cause fio to issue + 8 I/Os before selecting a new file at random. For the non-uniform + distributions, a floating point postfix can be given to influence how the + distribution is skewed. See :option:`random_distribution` for a description + of how that would work. + +.. option:: ioscheduler=str + + Attempt to switch the device hosting the file to the specified I/O scheduler + before running. + +.. option:: create_serialize=bool + + If true, serialize the file creation for the jobs.
This may be handy to + avoid interleaving of data files, which may greatly depend on the filesystem + used and even the number of processors in the system. Default: true. + +.. option:: create_fsync=bool + + :manpage:`fsync(2)` the data file after creation. This is the default. + +.. option:: create_on_open=bool + + If true, don't pre-create files but allow the job's open() to create a file + when it's time to do I/O. Default: false -- pre-create all necessary files + when the job starts. + +.. option:: create_only=bool + + If true, fio will only run the setup phase of the job. If files need to be + laid out or updated on disk, only that will be done -- the actual job contents + are not executed. Default: false. + +.. option:: allow_file_create=bool + + If true, fio is permitted to create files as part of its workload. If this + option is false, then fio will error out if + the files it needs to use don't already exist. Default: true. + +.. option:: allow_mounted_write=bool + + If this isn't set, fio will abort jobs that are destructive (e.g. that write) + to what appears to be a mounted device or partition. This should help catch + creating inadvertently destructive tests, not realizing that the test will + destroy data on the mounted file system. Note that some platforms don't allow + writing against a mounted device regardless of this option. Default: false. + +.. option:: pre_read=bool + + If this is given, files will be pre-read into memory before starting the + given I/O operation. This will also clear the :option:`invalidate` flag, + since it is pointless to pre-read and then drop the cache. This will only + work for I/O engines that are seek-able, since they allow you to read the + same data multiple times. Thus it will not work on non-seekable I/O engines + (e.g. network, splice). Default: false. + +.. option:: unlink=bool + + Unlink the job files when done. 
Not the default, as repeated runs of that + job would then waste time recreating the file set again and again. Default: + false. + +.. option:: unlink_each_loop=bool + + Unlink job files after each iteration or loop. Default: false. + +.. option:: zonesize=int + + Divide a file into zones of the specified size. See :option:`zoneskip`. + +.. option:: zonerange=int + + Give size of an I/O zone. See :option:`zoneskip`. + +.. option:: zoneskip=int + + Skip the specified number of bytes when :option:`zonesize` data has been + read. The two zone options can be used to only do I/O on zones of a file. + + +I/O type +~~~~~~~~ + +.. option:: direct=bool + + If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that + OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous + ioengines don't support direct I/O. Default: false. + +.. option:: atomic=bool + + If value is true, attempt to use atomic direct I/O. Atomic writes are + guaranteed to be stable once acknowledged by the operating system. Only + Linux supports O_ATOMIC right now. + +.. option:: buffered=bool + + If value is true, use buffered I/O. This is the opposite of the + :option:`direct` option. Defaults to true. + +.. option:: readwrite=str, rw=str + + Type of I/O pattern. Accepted values are: + + **read** + Sequential reads. + **write** + Sequential writes. + **trim** + Sequential trims (Linux block devices only). + **randread** + Random reads. + **randwrite** + Random writes. + **randtrim** + Random trims (Linux block devices only). + **rw,readwrite** + Sequential mixed reads and writes. + **randrw** + Random mixed reads and writes. + **trimwrite** + Sequential trim+write sequences. Blocks will be trimmed first, + then the same blocks will be written to. + + Fio defaults to read if the option is not specified. For the mixed I/O + types, the default is to split them 50/50. For certain types of I/O the + result may still be skewed a bit, since the speed may be different. 
+ + It is possible to specify the number of I/Os to do before getting a new + offset by appending ``:<nr>`` to the end of the string given. For a + random read, it would look like ``rw=randread:8`` for passing in an offset + modifier with a value of 8. If the suffix is used with a sequential I/O + pattern, then the *<nr>* value specified will be **added** to the generated + offset for each I/O turning sequential I/O into sequential I/O with holes. + For instance, using ``rw=write:4k`` will skip 4k for every write. Also see + the :option:`rw_sequencer` option. + +.. option:: rw_sequencer=str + + If an offset modifier is given by appending a number to the ``rw=`` + line, then this option controls how that number modifies the I/O offset + being generated. Accepted values are: + + **sequential** + Generate sequential offset. + **identical** + Generate the same offset. + + ``sequential`` is only useful for random I/O, where fio would normally + generate a new random offset for every I/O. If you append e.g. 8 to randread, + you would get a new random offset for every 8 I/Os. The result would be a + seek for only every 8 I/Os, instead of for every I/O. Use ``rw=randread:8`` + to specify that. As sequential I/O is already sequential, setting + ``sequential`` for that would not result in any differences. ``identical`` + behaves in a similar fashion, except it sends the same offset 8 + times before generating a new offset. + +.. option:: unified_rw_reporting=bool + + Fio normally reports statistics on a per data direction basis, meaning that + reads, writes, and trims are accounted and reported separately. If this + option is set fio sums the results and reports them as "mixed" instead. + +.. option:: randrepeat=bool + + Seed the random number generator used for random I/O patterns in a + predictable way so the pattern is repeatable across runs. Default: true. + +..
option:: allrandrepeat=bool + + Seed all random number generators in a predictable way so results are + repeatable across runs. Default: false. + +.. option:: randseed=int + + Seed the random number generators based on this seed value, to be able to + control what sequence of output is being generated. If not set, the random + sequence depends on the :option:`randrepeat` setting. + +.. option:: fallocate=str + + Whether pre-allocation is performed when laying down files. + Accepted values are: + + **none** + Do not pre-allocate space. + + **native** + Use a platform's native pre-allocation call but fall back to + **none** behavior if it fails/is not implemented. + + **posix** + Pre-allocate via :manpage:`posix_fallocate(3)`. + + **keep** + Pre-allocate via :manpage:`fallocate(2)` with + FALLOC_FL_KEEP_SIZE set. + + **0** + Backward-compatible alias for **none**. + + **1** + Backward-compatible alias for **posix**. + + May not be available on all supported platforms. **keep** is only available + on Linux. If using ZFS on Solaris this cannot be set to **posix** + because ZFS doesn't support pre-allocation. Default: **native** if any + pre-allocation methods are available, **none** if not. + +.. option:: fadvise_hint=str + + Use :manpage:`posix_fadvise(2)` to advise the kernel on what I/O patterns + are likely to be issued. Accepted values are: + + **0** + Backwards-compatible hint for "no hint". + + **1** + Backwards compatible hint for "advise with fio workload type". This + uses **FADV_RANDOM** for a random workload, and **FADV_SEQUENTIAL** + for a sequential workload. + + **sequential** + Advise using **FADV_SEQUENTIAL**. + + **random** + Advise using **FADV_RANDOM**. + +.. option:: write_hint=str + + Use :manpage:`fcntl(2)` to advise the kernel what life time to expect + from a write. Only supported on Linux, as of version 4.13. Accepted + values are: + + **none** + No particular life time associated with this file. 
+ + **short** + Data written to this file has a short life time. + + **medium** + Data written to this file has a medium life time. + + **long** + Data written to this file has a long life time. + + **extreme** + Data written to this file has a very long life time. + + The values are all relative to each other, and no absolute meaning + should be associated with them. + +.. option:: offset=int + + Start I/O at the provided offset in the file, given as either a fixed size in + bytes or a percentage. If a percentage is given, the next ``blockalign``-ed + offset will be used. Data before the given offset will not be touched. This + effectively caps the file size at `real_size - offset`. Can be combined with + :option:`size` to constrain the start and end range of the I/O workload. + A percentage can be specified by a number between 1 and 100 followed by '%', + for example, ``offset=20%`` to specify 20%. + +.. option:: offset_increment=int + + If this is provided, then the real offset becomes `offset + offset_increment + * thread_number`, where the thread number is a counter that starts at 0 and + is incremented for each sub-job (i.e. when :option:`numjobs` option is + specified). This option is useful if there are several jobs which are + intended to operate on a file in parallel disjoint segments, with even + spacing between the starting points. + +.. option:: number_ios=int + + Fio will normally perform I/Os until it has exhausted the size of the region + set by :option:`size`, or if it exhausts the allocated time (or hits an error + condition). With this setting, the range/size can be set independently of + the number of I/Os to perform. When fio reaches this number, it will exit + normally and report status. Note that this does not extend the amount of I/O + that will be done, it will only stop fio if this condition is met before + other end-of-job criteria. + +..
option:: fsync=int + + If writing to a file, issue an :manpage:`fsync(2)` (or its equivalent) of + the dirty data for every number of blocks given. For example, if you give 32 + as a parameter, fio will sync the file after every 32 writes issued. If fio is + using non-buffered I/O, we may not sync the file. The exception is the sg + I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which + means fio does not periodically issue and wait for a sync to complete. Also + see :option:`end_fsync` and :option:`fsync_on_close`. + +.. option:: fdatasync=int + + Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and + not metadata blocks. In Windows, FreeBSD, and DragonFlyBSD there is no + :manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`. + Defaults to 0, which means fio does not periodically issue and wait for a + data-only sync to complete. + +.. option:: write_barrier=int + + Make every `N-th` write a barrier write. + +.. option:: sync_file_range=str:int + + Use :manpage:`sync_file_range(2)` for every `int` number of write + operations. Fio will track range of writes that have happened since the last + :manpage:`sync_file_range(2)` call. `str` can currently be one or more of: + + **wait_before** + SYNC_FILE_RANGE_WAIT_BEFORE + **write** + SYNC_FILE_RANGE_WRITE + **wait_after** + SYNC_FILE_RANGE_WAIT_AFTER + + So if you do ``sync_file_range=wait_before,write:8``, fio would use + ``SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE`` for every 8 + writes. Also see the :manpage:`sync_file_range(2)` man page. This option is + Linux specific. + +.. option:: overwrite=bool + + If true, writes to a file will always overwrite existing data. If the file + doesn't already exist, it will be created before the write phase begins. If + the file exists and is large enough for the specified write phase, nothing + will be done. Default: false. + +.. 
option:: end_fsync=bool + + If true, :manpage:`fsync(2)` file contents when a write stage has completed. + Default: false. + +.. option:: fsync_on_close=bool + + If true, fio will :manpage:`fsync(2)` a dirty file on close. This differs + from :option:`end_fsync` in that it will happen on every file close, not + just at the end of the job. Default: false. + +.. option:: rwmixread=int + + Percentage of a mixed workload that should be reads. Default: 50. + +.. option:: rwmixwrite=int + + Percentage of a mixed workload that should be writes. If both + :option:`rwmixread` and :option:`rwmixwrite` are given and the values do not + add up to 100%, the latter of the two will be used to override the + first. This may interfere with a given rate setting, if fio is asked to + limit reads or writes to a certain rate. If that is the case, then the + distribution may be skewed. Default: 50. + +.. option:: random_distribution=str:float[,str:float][,str:float] + + By default, fio will use a completely uniform random distribution when asked + to perform random I/O. Sometimes it is useful to skew the distribution in + specific ways, ensuring that some parts of the data are hotter than others. + fio includes the following distribution models: + + **random** + Uniform random distribution + + **zipf** + Zipf distribution + + **pareto** + Pareto distribution + + **normal** + Normal (Gaussian) distribution + + **zoned** + Zoned random distribution + + When using a **zipf** or **pareto** distribution, an input value is also + needed to define the access pattern. For **zipf**, this is the `Zipf + theta`. For **pareto**, it's the `Pareto power`. Fio includes a test + program, :command:`fio-genzipf`, that can be used to visualize what the given input + values will yield in terms of hit rates. If you wanted to use **zipf** with + a `theta` of 1.2, you would use ``random_distribution=zipf:1.2`` as the + option. If a non-uniform model is used, fio will disable use of the random + map.
For the **normal** distribution, a normal (Gaussian) deviation is + supplied as a value between 0 and 100. + + For a **zoned** distribution, fio supports specifying percentages of I/O + access that should fall within what range of the file or device. For + example, given a criteria of: + + * 60% of accesses should be to the first 10% + * 30% of accesses should be to the next 20% + * 8% of accesses should be to the next 30% + * 2% of accesses should be to the next 40% + + we can define that through zoning of the random accesses. For the above + example, the user would do:: + + random_distribution=zoned:60/10:30/20:8/30:2/40 + + similarly to how :option:`bssplit` works for setting ranges and percentages + of block sizes. Like :option:`bssplit`, it's possible to specify separate + zones for reads, writes, and trims. If just one set is given, it'll apply to + all of them. + +.. option:: percentage_random=int[,int][,int] + + For a random workload, set how big a percentage should be random. This + defaults to 100%, in which case the workload is fully random. It can be set + from anywhere from 0 to 100. Setting it to 0 would make the workload fully + sequential. Any setting in between will result in a random mix of sequential + and random I/O, at the given percentages. Comma-separated values may be + specified for reads, writes, and trims as described in :option:`blocksize`. + +.. option:: norandommap + + Normally fio will cover every block of the file when doing random I/O. If + this option is given, fio will just get a new random offset without looking + at past I/O history. This means that some blocks may not be read or written, + and that some blocks may be read/written more than once. If this option is + used with :option:`verify` and multiple blocksizes (via :option:`bsrange`), + only intact blocks are verified, i.e., partially-overwritten blocks are + ignored. + +.. option:: softrandommap=bool + + See :option:`norandommap`. 
If fio runs with the random block map enabled and + it fails to allocate the map, if this option is set it will continue without + a random block map. As coverage will not be as complete as with random maps, + this option is disabled by default. + +.. option:: random_generator=str + + Fio supports the following engines for generating I/O offsets for random I/O: + + **tausworthe** + Strong 2^88 cycle random number generator. + **lfsr** + Linear feedback shift register generator. + **tausworthe64** + Strong 64-bit 2^258 cycle random number generator. + + **tausworthe** is a strong random number generator, but it requires tracking + on the side if we want to ensure that blocks are only read or written + once. **lfsr** guarantees that we never generate the same offset twice, and + it's also less computationally expensive. It's not a true random generator, + however, though for I/O purposes it's typically good enough. **lfsr** only + works with single block sizes, not with workloads that use multiple block + sizes. If used with such a workload, fio may read or write some blocks + multiple times. The default value is **tausworthe**, unless the required + space exceeds 2^32 blocks. If it does, then **tausworthe64** is + selected automatically. + + +Block size +~~~~~~~~~~ + +.. option:: blocksize=int[,int][,int], bs=int[,int][,int] + + The block size in bytes used for I/O units. Default: 4096. A single value + applies to reads, writes, and trims. Comma-separated values may be + specified for reads, writes, and trims. A value not terminated in a comma + applies to subsequent types. + + Examples: + + **bs=256k** + means 256k for reads, writes and trims. + + **bs=8k,32k** + means 8k for reads, 32k for writes and trims. + + **bs=8k,32k,** + means 8k for reads, 32k for writes, and default for trims. + + **bs=,8k** + means default for reads, 8k for writes and trims. + + **bs=,8k,** + means default for reads, 8k for writes, and default for trims. + +.. 
option:: blocksize_range=irange[,irange][,irange], bsrange=irange[,irange][,irange] + + A range of block sizes in bytes for I/O units. The issued I/O unit will + always be a multiple of the minimum size, unless + :option:`blocksize_unaligned` is set. + + Comma-separated ranges may be specified for reads, writes, and trims as + described in :option:`blocksize`. + + Example: ``bsrange=1k-4k,2k-8k``. + +.. option:: bssplit=str[,str][,str] + + Sometimes you want even finer grained control of the block sizes issued, not + just an even split between them. This option allows you to weight various + block sizes, so that you are able to define a specific amount of block sizes + issued. The format for this option is:: + + bssplit=blocksize/percentage:blocksize/percentage + + for as many block sizes as needed. So if you want to define a workload that + has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would write:: + + bssplit=4k/10:64k/50:32k/40 + + Ordering does not matter. If the percentage is left blank, fio will fill in + the remaining values evenly. So a bssplit option like this one:: + + bssplit=4k/50:1k/:32k/ + + would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always add up + to 100; if bssplit is given a range that adds up to more, it will error out. + + Comma-separated values may be specified for reads, writes, and trims as + described in :option:`blocksize`. + + If you want a workload that has 50% 2k reads and 50% 4k reads, while having + 90% 4k writes and 10% 8k writes, you would specify:: + + bssplit=2k/50:4k/50,4k/90:8k/10 + +.. option:: blocksize_unaligned, bs_unaligned + + If set, fio will issue I/O units with any size within + :option:`blocksize_range`, not just multiples of the minimum size. This + typically won't work with direct I/O, as that normally requires sector + alignment. + +..
option:: bs_is_seq_rand=bool + + If this option is set, fio will use the normal read,write blocksize settings + as sequential,random blocksize settings instead. Any random read or write + will use the WRITE blocksize settings, and any sequential read or write will + use the READ blocksize settings. + +.. option:: blockalign=int[,int][,int], ba=int[,int][,int] + + Boundary to which fio will align random I/O units. Default: + :option:`blocksize`. Minimum alignment is typically 512b for using direct + I/O, though it usually depends on the hardware block size. This option is + mutually exclusive with using a random map for files, so it will turn off + that option. Comma-separated values may be specified for reads, writes, and + trims as described in :option:`blocksize`. + + +Buffers and memory +~~~~~~~~~~~~~~~~~~ + +.. option:: zero_buffers + + Initialize buffers with all zeros. Default: fill buffers with random data. + +.. option:: refill_buffers + + If this option is given, fio will refill the I/O buffers on every + submit. The default is to only fill it at init time and reuse that + data. Only makes sense if zero_buffers isn't specified, naturally. If data + verification is enabled, `refill_buffers` is also automatically enabled. + +.. option:: scramble_buffers=bool + + If :option:`refill_buffers` is too costly and the target is using data + deduplication, then setting this option will slightly modify the I/O buffer + contents to defeat normal de-dupe attempts. This is not enough to defeat + more clever block compression attempts, but it will stop naive dedupe of + blocks. Default: true. + +.. option:: buffer_compress_percentage=int + + If this is set, then fio will attempt to provide I/O buffer content (on + WRITEs) that compresses to the specified level. Fio does this by providing a + mix of random data and a fixed pattern. The fixed pattern is either zeros, + or the pattern specified by :option:`buffer_pattern`. 
If the pattern option + is used, it might skew the compression ratio slightly. Note that this is per + block size unit; for file/disk wide compression level that matches this + setting, you'll also want to set :option:`refill_buffers`. + +.. option:: buffer_compress_chunk=int + + See :option:`buffer_compress_percentage`. This setting allows fio to manage + how big the ranges of random data and zeroed data are. Without this set, fio + will provide :option:`buffer_compress_percentage` of blocksize random data, + followed by the remaining zeroed. With this set to some chunk size smaller + than the block size, fio can alternate random and zeroed data throughout the + I/O buffer. + +.. option:: buffer_pattern=str + + If set, fio will fill the I/O buffers with this pattern or with the contents + of a file. If not set, the contents of I/O buffers are defined by the other + options related to buffer contents. The setting can be any pattern of bytes, + and can be prefixed with 0x for hex values. It may also be a string, where + the string must then be wrapped with ``""``. Or it may also be a filename, + where the filename must be wrapped with ``''`` in which case the file is + opened and read. Note that not all the file contents will be read if that + would cause the buffers to overflow. So, for example:: + + buffer_pattern='filename' + + or:: + + buffer_pattern="abcd" + + or:: + + buffer_pattern=-12 + + or:: + + buffer_pattern=0xdeadface + + Also you can combine everything together in any order:: + + buffer_pattern=0xdeadface"abcd"-12'filename' + +.. option:: dedupe_percentage=int + + If set, fio will generate this percentage of identical buffers when + writing. These buffers will be naturally dedupable. The contents of the + buffers depend on what other buffer compression settings have been set. It's + possible to have the individual buffers either fully compressible, or not at + all. This option only controls the distribution of unique buffers. + +..
option:: invalidate=bool + + Invalidate the buffer/page cache parts of the files to be used prior to + starting I/O if the platform and file type support it. Defaults to true. + This will be ignored if :option:`pre_read` is also specified for the + same job. + +.. option:: sync=bool + + Use synchronous I/O for buffered writes. For the majority of I/O engines, + this means using O_SYNC. Default: false. + +.. option:: iomem=str, mem=str + + Fio can use various types of memory as the I/O unit buffer. The allowed + values are: + + **malloc** + Use memory from :manpage:`malloc(3)` as the buffers. Default memory + type. + + **shm** + Use shared memory as the buffers. Allocated through + :manpage:`shmget(2)`. + + **shmhuge** + Same as shm, but use huge pages as backing. + + **mmap** + Use :manpage:`mmap(2)` to allocate buffers. May either be anonymous memory, or can + be file backed if a filename is given after the option. The format + is `mem=mmap:/path/to/file`. + + **mmaphuge** + Use a memory mapped huge file as the buffer backing. Append filename + after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file`. + + **mmapshared** + Same as mmap, but use a MAP_SHARED mapping. + + **cudamalloc** + Use GPU memory as the buffers for GPUDirect RDMA benchmark. + The :option:`ioengine` must be `rdma`. + + The area allocated is a function of the maximum allowed bs size for the job, + multiplied by the I/O depth given. Note that for **shmhuge** and + **mmaphuge** to work, the system must have free huge pages allocated. This + can normally be checked and set by reading/writing + :file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page + is 4MiB in size. So to calculate the number of huge pages you need for a + given job file, add up the I/O depth of all jobs (normally one unless + :option:`iodepth` is used) and multiply by the maximum bs set. Then divide + that number by the huge page size. You can see the size of the huge pages in + :file:`/proc/meminfo`.
If no huge pages are allocated by having a non-zero + number in `nr_hugepages`, using **mmaphuge** or **shmhuge** will fail. Also + see :option:`hugepage-size`. + + **mmaphuge** also needs to have hugetlbfs mounted and the file location + should point there. So if it's mounted in :file:`/huge`, you would use + `mem=mmaphuge:/huge/somefile`. + +.. option:: iomem_align=int, mem_align=int + + This indicates the memory alignment of the I/O memory buffers. Note that + the given alignment is applied to the first I/O unit buffer; if using + :option:`iodepth` the alignment of the following buffers is given by the + :option:`bs` used. In other words, if using a :option:`bs` that is a + multiple of the page size in the system, all buffers will be aligned to + this value. If using a :option:`bs` that is not page aligned, the alignment + of subsequent I/O memory buffers is the sum of the :option:`iomem_align` and + :option:`bs` used. + +.. option:: hugepage-size=int + + Defines the size of a huge page. Must at least be equal to the system + setting, see :file:`/proc/meminfo`. Defaults to 4MiB. Should probably + always be a multiple of megabytes, so using ``hugepage-size=Xm`` is the + preferred way to set this to avoid setting a non-pow-2 bad value. + +.. option:: lockmem=int + + Pin the specified amount of memory with :manpage:`mlock(2)`. Can be used to + simulate a smaller amount of memory. The amount specified is per worker. + + +I/O size +~~~~~~~~ + +.. option:: size=int + + The total size of file I/O for each thread of this job. Fio will run until + this many bytes have been transferred, unless runtime is limited by other options + (such as :option:`runtime`, for instance, or increased/decreased by :option:`io_size`). + Fio will divide this size between the available files determined by options + such as :option:`nrfiles`, :option:`filename`, unless :option:`filesize` is + specified by the job.
If the result of division happens to be 0, the size is + set to the physical size of the given files or devices if they exist. + If this option is not specified, fio will use the full size of the given + files or devices. If the files do not exist, size must be given. It is also + possible to give size as a percentage between 1 and 100. If ``size=20%`` is + given, fio will use 20% of the full size of the given files or devices. + Can be combined with :option:`offset` to constrain the start and end range + that I/O will be done within. + +.. option:: io_size=int, io_limit=int + + Normally fio operates within the region set by :option:`size`, which means + that the :option:`size` option sets both the region and size of I/O to be + performed. Sometimes that is not what you want. With this option, it is + possible to define just the amount of I/O that fio should do. For instance, + if :option:`size` is set to 20GiB and :option:`io_size` is set to 5GiB, fio + will perform I/O within the first 20GiB but exit when 5GiB have been + done. The opposite is also possible -- if :option:`size` is set to 20GiB, + and :option:`io_size` is set to 40GiB, then fio will do 40GiB of I/O within + the 0..20GiB region. + +.. option:: filesize=irange(int) + + Individual file sizes. May be a range, in which case fio will select sizes + for files at random within the given range and limited to :option:`size` in + total (if that is given). If not given, each created file is the same size. + This option overrides :option:`size` in terms of file size, which means + this value is used as a fixed size or possible range of each file. + +.. option:: file_append=bool + + Perform I/O after the end of the file. Normally fio will operate within the + size of a file. If this option is set, then fio will append to the file + instead. This has identical behavior to setting :option:`offset` to the size + of a file. This option is ignored on non-regular files. + +.. 
option:: fill_device=bool, fill_fs=bool + + Sets size to something really large and waits for ENOSPC (no space left on + device) as the terminating condition. Only makes sense with sequential + write. For a read workload, the mount point will be filled first then I/O + started on the result. This option doesn't make sense if operating on a raw + device node, since the size of that is already known by the file system. + Additionally, writing beyond end-of-device will not return ENOSPC there. + + +I/O engine +~~~~~~~~~~ + +.. option:: ioengine=str + + Defines how the job issues I/O to the file. The following types are defined: + + **sync** + Basic :manpage:`read(2)` or :manpage:`write(2)` + I/O. :manpage:`lseek(2)` is used to position the I/O location. + See :option:`fsync` and :option:`fdatasync` for syncing write I/Os. + + **psync** + Basic :manpage:`pread(2)` or :manpage:`pwrite(2)` I/O. Default on + all supported operating systems except for Windows. + + **vsync** + Basic :manpage:`readv(2)` or :manpage:`writev(2)` I/O. Will emulate + queuing by coalescing adjacent I/Os into a single submission. + + **pvsync** + Basic :manpage:`preadv(2)` or :manpage:`pwritev(2)` I/O. + + **pvsync2** + Basic :manpage:`preadv2(2)` or :manpage:`pwritev2(2)` I/O. + + **libaio** + Linux native asynchronous I/O. Note that Linux may only support + queued behavior with non-buffered I/O (set ``direct=1`` or + ``buffered=0``). + This engine defines engine specific options. + + **posixaio** + POSIX asynchronous I/O using :manpage:`aio_read(3)` and + :manpage:`aio_write(3)`. + + **solarisaio** + Solaris native asynchronous I/O. + + **windowsaio** + Windows native asynchronous I/O. Default on Windows. + + **mmap** + File is memory mapped with :manpage:`mmap(2)` and data copied + to/from using :manpage:`memcpy(3)`. + + **splice** + :manpage:`splice(2)` is used to transfer the data and + :manpage:`vmsplice(2)` to transfer data from user space to the + kernel. 
+ + **sg** + SCSI generic sg v3 I/O. May either be synchronous using the SG_IO + ioctl, or if the target is an sg character device we use + :manpage:`read(2)` and :manpage:`write(2)` for asynchronous + I/O. Requires :option:`filename` option to specify either block or + character devices. + + **null** + Doesn't transfer any data, just pretends to. This is mainly used to + exercise fio itself and for debugging/testing purposes. + + **net** + Transfer over the network to given ``host:port``. Depending on the + :option:`protocol` used, the :option:`hostname`, :option:`port`, + :option:`listen` and :option:`filename` options are used to specify + what sort of connection to make, while the :option:`protocol` option + determines which protocol will be used. This engine defines engine + specific options. + + **netsplice** + Like **net**, but uses :manpage:`splice(2)` and + :manpage:`vmsplice(2)` to map data and send/receive. + This engine defines engine specific options. + + **cpuio** + Doesn't transfer any data, but burns CPU cycles according to the + :option:`cpuload` and :option:`cpuchunks` options. Setting + :option:`cpuload`\=85 will cause that job to do nothing but burn 85% + of the CPU. In case of SMP machines, use :option:`numjobs`=<no_of_cpu> + to get desired CPU usage, as the cpuload only loads a + single CPU at the desired rate. A job never finishes unless there is + at least one non-cpuio job. + + **guasi** + The GUASI I/O engine is the Generic Userspace Asynchronous Syscall + Interface approach to async I/O. See + + http://www.xmailserver.org/guasi-lib.html + + for more info on GUASI. + + **rdma** + The RDMA I/O engine supports both RDMA memory semantics + (RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the + InfiniBand, RoCE and iWARP protocols. + + **falloc** + I/O engine that does regular fallocate to simulate data transfer as + fio ioengine. + + DDIR_READ + does fallocate(,mode = FALLOC_FL_KEEP_SIZE,). + + DDIR_WRITE + does fallocate(,mode = 0).
+ + DDIR_TRIM + does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE). + + **ftruncate** + I/O engine that sends :manpage:`ftruncate(2)` operations in response + to write (DDIR_WRITE) events. Each ftruncate issued sets the file's + size to the current block offset. :option:`blocksize` is ignored. + + **e4defrag** + I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate + defragment activity in request to DDIR_WRITE event. + + **rbd** + I/O engine supporting direct access to Ceph Rados Block Devices + (RBD) via librbd without the need to use the kernel rbd driver. This + ioengine defines engine specific options. + + **gfapi** + Using GlusterFS libgfapi sync interface to direct access to + GlusterFS volumes without having to go through FUSE. This ioengine + defines engine specific options. + + **gfapi_async** + Using GlusterFS libgfapi async interface to direct access to + GlusterFS volumes without having to go through FUSE. This ioengine + defines engine specific options. + + **libhdfs** + Read and write through Hadoop (HDFS). The :option:`filename` option + is used to specify host,port of the hdfs name-node to connect. This + engine interprets offsets a little differently. In HDFS, files once + created cannot be modified so random writes are not possible. To + imitate this the libhdfs engine expects a bunch of small files to be + created over HDFS and will randomly pick a file from them + based on the offset generated by fio backend (see the example + job file to create such files, use ``rw=write`` option). Please + note, it may be necessary to set environment variables to work + with HDFS/libhdfs properly. Each job uses its own connection to + HDFS. + + **mtd** + Read, write and erase an MTD character device (e.g., + :file:`/dev/mtd0`). Discards are treated as erases. 
Depending on the + underlying device type, the I/O may have to go in a certain pattern, + e.g., on NAND, writing sequentially to erase blocks and discarding + before overwriting. The `trimwrite` mode works well for this + constraint. + + **pmemblk** + Read and write using filesystem DAX to a file on a filesystem + mounted with DAX on a persistent memory device through the NVML + libpmemblk library. + + **dev-dax** + Read and write using device DAX to a persistent memory device (e.g., + /dev/dax0.0) through the NVML libpmem library. + + **external** + Prefix to specify loading an external I/O engine object file. Append + the engine filename, e.g. ``ioengine=external:/tmp/foo.o`` to load + ioengine :file:`foo.o` in :file:`/tmp`. The path can be either + absolute or relative. See :file:`engines/skeleton_external.c` for + details of writing an external I/O engine. + + +I/O engine specific parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition, there are some parameters which are only valid when a specific +:option:`ioengine` is in use. These are used identically to normal parameters, +with the caveat that when used on the command line, they must come after the +:option:`ioengine` that defines them is selected. + +.. option:: userspace_reap : [libaio] + + Normally, with the libaio engine in use, fio will use the + :manpage:`io_getevents(2)` system call to reap newly returned events. With + this flag turned on, the AIO ring will be read directly from user-space to + reap events. The reaping mode is only enabled when polling for a minimum of + 0 events (e.g. when :option:`iodepth_batch_complete` `=0`). + +.. option:: hipri : [pvsync2] + + Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority + than normal. + +.. option:: hipri_percentage : [pvsync2] + + When hipri is set this determines the probability of a pvsync2 I/O being high + priority. The default is 100%. + +.. 
option:: cpuload=int : [cpuio] + + Attempt to use the specified percentage of CPU cycles. This is a mandatory + option when using cpuio I/O engine. + +.. option:: cpuchunks=int : [cpuio] + + Split the load into cycles of the given time. In microseconds. + +.. option:: exit_on_io_done=bool : [cpuio] + + Detect when I/O threads are done, then exit. + +.. option:: namenode=str : [libhdfs] + + The hostname or IP address of a HDFS cluster namenode to contact. + +.. option:: port=int + + [libhdfs] + + The listening port of the HDFS cluster namenode. + + [netsplice], [net] + + The TCP or UDP port to bind to or connect to. If this is used with + :option:`numjobs` to spawn multiple instances of the same job type, then + this will be the starting port number since fio will use a range of + ports. + +.. option:: hostname=str : [netsplice] [net] + + The hostname or IP address to use for TCP or UDP based I/O. If the job is + a TCP listener or UDP reader, the hostname is not used and must be omitted + unless it is a valid UDP multicast address. + +.. option:: interface=str : [netsplice] [net] + + The IP address of the network interface used to send or receive UDP + multicast. + +.. option:: ttl=int : [netsplice] [net] + + Time-to-live value for outgoing UDP multicast packets. Default: 1. + +.. option:: nodelay=bool : [netsplice] [net] + + Set TCP_NODELAY on TCP connections. + +.. option:: protocol=str, proto=str : [netsplice] [net] + + The network protocol to use. Accepted values are: + + **tcp** + Transmission control protocol. + **tcpv6** + Transmission control protocol V6. + **udp** + User datagram protocol. + **udpv6** + User datagram protocol V6. + **unix** + UNIX domain socket. + + When the protocol is TCP or UDP, the port must also be given, as well as the + hostname if the job is a TCP listener or UDP reader. For unix sockets, the + normal :option:`filename` option should be used and the port is invalid. + +..
option:: listen : [netsplice] [net] + + For TCP network connections, tell fio to listen for incoming connections + rather than initiating an outgoing connection. The :option:`hostname` must + be omitted if this option is used. + +.. option:: pingpong : [netsplice] [net] + + Normally a network writer will just continue writing data, and a network + reader will just consume packets. If ``pingpong=1`` is set, a writer will + send its normal payload to the reader, then wait for the reader to send the + same payload back. This allows fio to measure network latencies. The + submission and completion latencies then measure local time spent sending or + receiving, and the completion latency measures how long it took for the + other end to receive and send back. For UDP multicast traffic + ``pingpong=1`` should only be set for a single reader when multiple readers + are listening to the same address. + +.. option:: window_size : [netsplice] [net] + + Set the desired socket buffer size for the connection. + +.. option:: mss : [netsplice] [net] + + Set the TCP maximum segment size (TCP_MAXSEG). + +.. option:: donorname=str : [e4defrag] + + File will be used as a block donor (swap extents between files). + +.. option:: inplace=int : [e4defrag] + + Configure donor file blocks allocation strategy: + + **0** + Default. Preallocate donor's file on init. + **1** + Allocate space immediately inside defragment event, and free right + after event. + +.. option:: clustername=str : [rbd] + + Specifies the name of the Ceph cluster. + +.. option:: rbdname=str : [rbd] + + Specifies the name of the RBD. + +.. option:: pool=str : [rbd] + + Specifies the name of the Ceph pool containing RBD. + +.. option:: clientname=str : [rbd] + + Specifies the username (without the 'client.' prefix) used to access the + Ceph cluster. If the *clustername* is specified, the *clientname* shall be + the full *type.id* string. If no type. prefix is given, fio will add + 'client.' by default. + +..
option:: skip_bad=bool : [mtd] + + Skip operations against known bad blocks. + +.. option:: hdfsdirectory : [libhdfs] + + libhdfs will create chunks in this HDFS directory. + +.. option:: chunk_size : [libhdfs] + + The size of the chunk to use for each file. + + +I/O depth +~~~~~~~~~ + +.. option:: iodepth=int + + Number of I/O units to keep in flight against the file. Note that + increasing *iodepth* beyond 1 will not affect synchronous ioengines (except + for small degrees when :option:`verify_async` is in use). Even async + engines may impose OS restrictions causing the desired depth not to be + achieved. This may happen on Linux when using libaio and not setting + :option:`direct`\=1, since buffered I/O is not async on that OS. Keep an + eye on the I/O depth distribution in the fio output to verify that the + achieved depth is as expected. Default: 1. + +.. option:: iodepth_batch_submit=int, iodepth_batch=int + + This defines how many pieces of I/O to submit at once. It defaults to 1 + which means that we submit each I/O as soon as it is available, but can be + raised to submit bigger batches of I/O at the time. If it is set to 0 the + :option:`iodepth` value will be used. + +.. option:: iodepth_batch_complete_min=int, iodepth_batch_complete=int + + This defines how many pieces of I/O to retrieve at once. It defaults to 1 + which means that we'll ask for a minimum of 1 I/O in the retrieval process + from the kernel. The I/O retrieval will go on until we hit the limit set by + :option:`iodepth_low`. If this variable is set to 0, then fio will always + check for completed events before queuing more I/O. This helps reduce I/O + latency, at the cost of more retrieval system calls. + +.. option:: iodepth_batch_complete_max=int + + This defines the maximum number of pieces of I/O to retrieve at once. This variable should + be used along with :option:`iodepth_batch_complete_min`\=int variable, + specifying the range of min and max amount of I/O which should be + retrieved.
By default it is equal to the :option:`iodepth_batch_complete_min` + value. + + Example #1:: + + iodepth_batch_complete_min=1 + iodepth_batch_complete_max=<iodepth> + + which means that we will retrieve at least 1 I/O and up to the whole + submitted queue depth. If none of I/O has been completed yet, we will wait. + + Example #2:: + + iodepth_batch_complete_min=0 + iodepth_batch_complete_max=<iodepth> + + which means that we can retrieve up to the whole submitted queue depth, but + if none of I/O has been completed yet, we will NOT wait and immediately exit + the system call. In this example we simply do polling. + +.. option:: iodepth_low=int + + The low water mark indicating when to start filling the queue + again. Defaults to the same as :option:`iodepth`, meaning that fio will + attempt to keep the queue full at all times. If :option:`iodepth` is set to + e.g. 16 and *iodepth_low* is set to 4, then after fio has filled the queue of + 16 requests, it will let the depth drain down to 4 before starting to fill + it again. + +.. option:: serialize_overlap=bool + + Serialize in-flight I/Os that might otherwise cause or suffer from data races. + When two or more I/Os are submitted simultaneously, there is no guarantee that + the I/Os will be processed or completed in the submitted order. Further, if + two or more of those I/Os are writes, any overlapping region between them can + become indeterminate/undefined on certain storage. These issues can cause + verification to fail erratically when at least one of the racing I/Os is + changing data and the overlapping region has a non-zero size. Setting + ``serialize_overlap`` tells fio to avoid provoking this behavior by explicitly + serializing in-flight I/Os that have a non-zero overlap. Note that setting + this option can reduce both performance and the :option:`iodepth` achieved. + Additionally this option does not work when :option:`io_submit_mode` is set to + offload. Default: false. + +..
option:: io_submit_mode=str + + This option controls how fio submits the I/O to the I/O engine. The default + is `inline`, which means that the fio job threads submit and reap I/O + directly. If set to `offload`, the job threads will offload I/O submission + to a dedicated pool of I/O threads. This requires some coordination and thus + has a bit of extra overhead, especially for lower queue depth I/O where it + can increase latencies. The benefit is that fio can manage submission rates + independently of the device completion rates. This avoids skewed latency + reporting if I/O gets backed up on the device side (the coordinated omission + problem). + + +I/O rate +~~~~~~~~ + +.. option:: thinktime=time + + Stall the job for the specified period of time after an I/O has completed before issuing the + next. May be used to simulate processing being done by an application. + When the unit is omitted, the value is interpreted in microseconds. See + :option:`thinktime_blocks` and :option:`thinktime_spin`. + +.. option:: thinktime_spin=time + + Only valid if :option:`thinktime` is set - pretend to spend CPU time doing + something with the data received, before falling back to sleeping for the + rest of the period specified by :option:`thinktime`. When the unit is + omitted, the value is interpreted in microseconds. + +.. option:: thinktime_blocks=int + + Only valid if :option:`thinktime` is set - control how many blocks to issue, + before waiting :option:`thinktime` usecs. If not set, defaults to 1 which will make + fio wait :option:`thinktime` usecs after every block. This effectively makes any + queue depth setting redundant, since no more than 1 I/O will be queued + before we have to complete it and do our :option:`thinktime`. In other words, this + setting effectively caps the queue depth if the latter is larger. + +.. option:: rate=int[,int][,int] + + Cap the bandwidth used by this job. The number is in bytes/sec, the normal + suffix rules apply. 
Comma-separated values may be specified for reads, + writes, and trims as described in :option:`blocksize`. + + For example, using `rate=1m,500k` would limit reads to 1MiB/sec and writes to + 500KiB/sec. Capping only reads or writes can be done with `rate=,500k` or + `rate=500k,` where the former will only limit writes (to 500KiB/sec) and the + latter will only limit reads. + +.. option:: rate_min=int[,int][,int] + + Tell fio to do whatever it can to maintain at least this bandwidth. Failing + to meet this requirement will cause the job to exit. Comma-separated values + may be specified for reads, writes, and trims as described in + :option:`blocksize`. + +.. option:: rate_iops=int[,int][,int] + + Cap the bandwidth to this number of IOPS. Basically the same as + :option:`rate`, just specified independently of bandwidth. If the job is + given a block size range instead of a fixed value, the smallest block size + is used as the metric. Comma-separated values may be specified for reads, + writes, and trims as described in :option:`blocksize`. + +.. option:: rate_iops_min=int[,int][,int] + + If fio doesn't meet this rate of I/O, it will cause the job to exit. + Comma-separated values may be specified for reads, writes, and trims as + described in :option:`blocksize`. + +.. option:: rate_process=str + + This option controls how fio manages rated I/O submissions. The default is + `linear`, which submits I/O in a linear fashion with fixed delays between + I/Os that get adjusted based on I/O completion rates. If this is set to + `poisson`, fio will submit I/O based on a more real world random request + flow, known as the Poisson process + (https://en.wikipedia.org/wiki/Poisson_point_process). The lambda will be + 10^6 / IOPS for the given workload. + + +I/O latency +~~~~~~~~~~~ + +.. option:: latency_target=time + + If set, fio will attempt to find the max performance point that the given + workload will run at while maintaining a latency below this target.
When + the unit is omitted, the value is interpreted in microseconds. See + :option:`latency_window` and :option:`latency_percentile`. + +.. option:: latency_window=time + + Used with :option:`latency_target` to specify the sample window that the job + is run at varying queue depths to test the performance. When the unit is + omitted, the value is interpreted in microseconds. + +.. option:: latency_percentile=float + + The percentage of I/Os that must fall within the criteria specified by + :option:`latency_target` and :option:`latency_window`. If not set, this + defaults to 100.0, meaning that all I/Os must be equal to or below the value + set by :option:`latency_target`. + +.. option:: max_latency=time + + If set, fio will exit the job with an ETIMEDOUT error if it exceeds this + maximum latency. When the unit is omitted, the value is interpreted in + microseconds. + +.. option:: rate_cycle=int + + Average bandwidth for :option:`rate` and :option:`rate_min` over this number + of milliseconds. Defaults to 1000. + + +I/O replay +~~~~~~~~~~ + +.. option:: write_iolog=str + + Write the issued I/O patterns to the specified file. See + :option:`read_iolog`. Specify a separate file for each job, otherwise the + iologs will be interspersed and the file may be corrupt. + +.. option:: read_iolog=str + + Open an iolog with the specified filename and replay the I/O patterns it + contains. This can be used to store a workload and replay it sometime + later. The iolog given may also be a blktrace binary file, which allows fio + to replay a workload captured by :command:`blktrace`. See + :manpage:`blktrace(8)` for how to capture such logging data. For blktrace + replay, the file needs to be turned into a blkparse binary data file first + (``blkparse <device> -o /dev/null -d file_for_fio.bin``). + +..
option:: replay_no_stall=bool + + When replaying I/O with :option:`read_iolog` the default behavior is to + attempt to respect the timestamps within the log and replay them with the + appropriate delay between IOPS. By setting this variable fio will not + respect the timestamps and attempt to replay them as fast as possible while + still respecting ordering. The result is the same I/O pattern to a given + device, but different timings. + +.. option:: replay_redirect=str + + While replaying I/O patterns using :option:`read_iolog` the default behavior + is to replay the IOPS onto the major/minor device that each IOP was recorded + from. This is sometimes undesirable because on a different machine those + major/minor numbers can map to a different device. Changing hardware on the + same system can also result in a different major/minor mapping. + ``replay_redirect`` causes all I/Os to be replayed onto the single specified + device regardless of the device it was recorded + from. i.e. :option:`replay_redirect`\= :file:`/dev/sdc` would cause all I/O + in the blktrace or iolog to be replayed onto :file:`/dev/sdc`. This means + multiple devices will be replayed onto a single device, if the trace + contains multiple devices. If you want multiple devices to be replayed + concurrently to multiple redirected devices you must blkparse your trace + into separate traces and replay them with independent fio invocations. + Unfortunately this also breaks the strict time ordering between multiple + device accesses. + +.. option:: replay_align=int + + Force alignment of I/O offsets and lengths in a trace to this power of 2 + value. + +.. option:: replay_scale=int + + Scale sector offsets down by this factor when replaying traces. + + +Threads, processes and job synchronization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
option:: thread + + Fio defaults to creating jobs by using fork, however if this option is + given, fio will create jobs by using POSIX Threads' function + :manpage:`pthread_create(3)` to create threads instead. + +.. option:: wait_for=str + + If set, the current job won't be started until all workers of the specified + waitee job are done. + + ``wait_for`` operates on the job name basis, so there are a few + limitations. First, the waitee must be defined prior to the waiter job + (meaning no forward references). Second, if a job is being referenced as a + waitee, it must have a unique name (no duplicate waitees). + +.. option:: nice=int + + Run the job with the given nice value. See man :manpage:`nice(2)`. + + On Windows, values less than -15 set the process class to "High"; -1 through + -15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle" + priority class. + +.. option:: prio=int + + Set the I/O priority value of this job. Linux limits us to a positive value + between 0 and 7, with 0 being the highest. See man + :manpage:`ionice(1)`. Refer to an appropriate manpage for other operating + systems since meaning of priority may differ. + +.. option:: prioclass=int + + Set the I/O priority class. See man :manpage:`ionice(1)`. + +.. option:: cpumask=int + + Set the CPU affinity of this job. The parameter given is a bit mask of + allowed CPUs the job may run on. So if you want the allowed CPUs to be 1 + and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man + :manpage:`sched_setaffinity(2)`. This may not work on all supported + operating systems or kernel versions. This option doesn't work well for a + higher CPU count than what you can store in an integer mask, so it can only + control cpus 1-32. For boxes with larger CPU counts, use + :option:`cpus_allowed`. + +.. option:: cpus_allowed=str + + Controls the same options as :option:`cpumask`, but accepts a textual + specification of the permitted CPUs instead. 
So to use CPUs 1 and 5 you + would specify ``cpus_allowed=1,5``. This option also allows a range of CPUs + to be specified -- say you wanted a binding to CPUs 1, 5, and 8 to 15, you + would set ``cpus_allowed=1,5,8-15``. + +.. option:: cpus_allowed_policy=str + + Set the policy of how fio distributes the CPUs specified by + :option:`cpus_allowed` or :option:`cpumask`. Two policies are supported: + + **shared** + All jobs will share the CPU set specified. + **split** + Each job will get a unique CPU from the CPU set. + + **shared** is the default behavior, if the option isn't specified. If + **split** is specified, then fio will assign one cpu per job. If not + enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs + in the set. + +.. option:: numa_cpu_nodes=str + + Set this job running on specified NUMA nodes' CPUs. The arguments allow + comma delimited list of cpu numbers, A-B ranges, or `all`. Note, to enable + NUMA options support, fio must be built on a system with libnuma-dev(el) + installed. + +.. option:: numa_mem_policy=str + + Set this job's memory policy and corresponding NUMA nodes. Format of the + arguments:: + + <mode>[:<nodelist>] + + ``mode`` is one of the following memory policies: ``default``, ``prefer``, + ``bind``, ``interleave`` or ``local``. For ``default`` and ``local`` memory + policies, no node needs to be specified. For ``prefer``, only one node is + allowed. For ``bind`` and ``interleave`` the ``nodelist`` may be as + follows: a comma delimited list of numbers, A-B ranges, or `all`. + +.. option:: cgroup=str + + Add job to this control group. If it doesn't exist, it will be created. The + system must have a mounted cgroup blkio mount point for this to work. If + your system doesn't have it mounted, you can do so with:: + + # mount -t cgroup -o blkio none /cgroup + +.. option:: cgroup_weight=int + + Set the weight of the cgroup to this value.
See the documentation that comes + with the kernel, allowed values are in the range of 100..1000. + +.. option:: cgroup_nodelete=bool + + Normally fio will delete the cgroups it has created after the job + completion. To override this behavior and to leave cgroups around after the + job completion, set ``cgroup_nodelete=1``. This can be useful if one wants + to inspect various cgroup files after job completion. Default: false. + +.. option:: flow_id=int + + The ID of the flow. If not specified, it defaults to being a global + flow. See :option:`flow`. + +.. option:: flow=int + + Weight in token-based flow control. If this value is used, then there is a + 'flow counter' which is used to regulate the proportion of activity between + two or more jobs. Fio attempts to keep this flow counter near zero. The + ``flow`` parameter stands for how much should be added or subtracted to the + flow counter on each iteration of the main I/O loop. That is, if one job has + ``flow=8`` and another job has ``flow=-1``, then there will be a roughly 1:8 + ratio in how much one runs vs the other. + +.. option:: flow_watermark=int + + The maximum value that the absolute value of the flow counter is allowed to + reach before the job must wait for a lower value of the counter. + +.. option:: flow_sleep=int + + The period of time, in microseconds, to wait after the flow watermark has + been exceeded before retrying operations. + +.. option:: stonewall, wait_for_previous + + Wait for preceding jobs in the job file to exit, before starting this + one. Can be used to insert serialization points in the job file. A stone + wall also implies starting a new reporting group, see + :option:`group_reporting`. + +.. option:: exitall + + By default, fio will continue running all other jobs when one job finishes + but sometimes this is not the desired action. Setting ``exitall`` will + instead make fio terminate all other jobs when one job finishes. + +.. 
option:: exec_prerun=str + + Before running this job, issue the command specified through + :manpage:`system(3)`. Output is redirected in a file called + :file:`jobname.prerun.txt`. + +.. option:: exec_postrun=str + + After the job completes, issue the command specified through + :manpage:`system(3)`. Output is redirected in a file called + :file:`jobname.postrun.txt`. + +.. option:: uid=int + + Instead of running as the invoking user, set the user ID to this value + before the thread/process does any work. + +.. option:: gid=int + + Set group ID, see :option:`uid`. + + +Verification +~~~~~~~~~~~~ + +.. option:: verify_only + + Do not perform specified workload, only verify data still matches previous + invocation of this workload. This option allows one to check data multiple + times at a later date without overwriting it. This option makes sense only + for workloads that write data, and does not support workloads with the + :option:`time_based` option set. + +.. option:: do_verify=bool + + Run the verify phase after a write phase. Only valid if :option:`verify` is + set. Default: true. + +.. option:: verify=str + + If writing to a file, fio can verify the file contents after each iteration + of the job. Each verification method also implies verification of special + header, which is written to the beginning of each block. This header also + includes meta information, like offset of the block, block number, timestamp + when block was written, etc. :option:`verify` can be combined with + :option:`verify_pattern` option. The allowed values are: + + **md5** + Use an md5 sum of the data area and store it in the header of + each block. + + **crc64** + Use an experimental crc64 sum of the data area and store it in the + header of each block. + + **crc32c** + Use a crc32c sum of the data area and store it in the header of + each block. This will automatically use hardware acceleration + (e.g.
SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will + fall back to software crc32c if none is found. Generally the + fastest checksum fio supports when hardware accelerated. + + **crc32c-intel** + Synonym for crc32c. + + **crc32** + Use a crc32 sum of the data area and store it in the header of each + block. + + **crc16** + Use a crc16 sum of the data area and store it in the header of each + block. + + **crc7** + Use a crc7 sum of the data area and store it in the header of each + block. + + **xxhash** + Use xxhash as the checksum function. Generally the fastest software + checksum that fio supports. + + **sha512** + Use sha512 as the checksum function. -1. Overview -2. How fio works -3. Running fio -4. Job file format -5. Detailed list of parameters -6. Normal output -7. Terse output -8. Trace file format -9. CPU idleness profiling -10. Verification and triggers -11. Log File Formats - - -1.0 Overview and history ------------------------- -fio was originally written to save me the hassle of writing special test -case programs when I wanted to test a specific workload, either for -performance reasons or to find/reproduce a bug. The process of writing -such a test app can be tiresome, especially if you have to do it often. -Hence I needed a tool that would be able to simulate a given io workload -without resorting to writing a tailored test case again and again. - -A test work load is difficult to define, though. There can be any number -of processes or threads involved, and they can each be using their own -way of generating io. You could have someone dirtying large amounts of -memory in an memory mapped file, or maybe several threads issuing -reads using asynchronous io. fio needed to be flexible enough to -simulate both of these cases, and many more. + **sha256** + Use sha256 as the checksum function.
-2.0 How fio works ------------------ -The first step in getting fio to simulate a desired io workload, is -writing a job file describing that specific setup. A job file may contain -any number of threads and/or files - the typical contents of the job file -is a global section defining shared parameters, and one or more job -sections describing the jobs involved. When run, fio parses this file -and sets everything up as described. If we break down a job from top to -bottom, it contains the following basic parameters: + **sha1** + Use optimized sha1 as the checksum function. - IO type Defines the io pattern issued to the file(s). - We may only be reading sequentially from this - file(s), or we may be writing randomly. Or even - mixing reads and writes, sequentially or randomly. + **sha3-224** + Use optimized sha3-224 as the checksum function. - Block size In how large chunks are we issuing io? This may be - a single value, or it may describe a range of - block sizes. + **sha3-256** + Use optimized sha3-256 as the checksum function. - IO size How much data are we going to be reading/writing. + **sha3-384** + Use optimized sha3-384 as the checksum function. - IO engine How do we issue io? We could be memory mapping the - file, we could be using regular read/write, we - could be using splice, async io, or even SG - (SCSI generic sg). + **sha3-512** + Use optimized sha3-512 as the checksum function. - IO depth If the io engine is async, how large a queuing - depth do we want to maintain? + **meta** + This option is deprecated, since now meta information is included in + generic verification header and meta verification happens by + default. For detailed information see the description of the + :option:`verify` setting. This option is kept because of + compatibility's sake with old configurations. Do not use it. - IO type Should we be doing buffered io, or direct/raw io? + **pattern** + Verify a strict pattern. 
Normally fio includes a header with some + basic information and checksumming, but if this option is set, only + the specific pattern set with :option:`verify_pattern` is verified. - Num files How many files are we spreading the workload over. + **null** + Only pretend to verify. Useful for testing internals with + :option:`ioengine`\=null, not for much else. - Num threads How many threads or processes should we spread - this workload over. + This option can be used for repeated burn-in tests of a system to make sure + that the written data is also correctly read back. If the data direction + given is a read or random read, fio will assume that it should verify a + previously written file. If the data direction includes any form of write, + the verify will be of the newly written data. -The above are the basic parameters defined for a workload, in addition -there's a multitude of parameters that modify other aspects of how this -job behaves. +.. option:: verifysort=bool + If true, fio will sort written verify blocks when it deems it faster to read + them back in a sorted manner. This is often the case when overwriting an + existing file, since the blocks are already laid out in the file system. You + can ignore this option unless doing huge amounts of really fast I/O where + the red-black tree sorting CPU time becomes significant. Default: true. -3.0 Running fio ---------------- -See the README file for command line parameters, there are only a few -of them. +.. option:: verifysort_nr=int -Running fio is normally the easiest part - you just give it the job file -(or job files) as parameters: + Pre-load and sort verify blocks for a read workload. -$ fio job_file +.. option:: verify_offset=int -and it will start doing what the job_file tells it to do. You can give -more than one job file on the command line, fio will serialize the running -of those files. Internally that is the same as using the 'stonewall' -parameter described in the parameter section. 
- -If the job file contains only one job, you may as well just give the -parameters on the command line. The command line parameters are identical -to the job parameters, with a few extra that control global parameters -(see README). For example, for the job file parameter iodepth=2, the -mirror command line option would be --iodepth 2 or --iodepth=2. You can -also use the command line for giving more than one job entry. For each ---name option that fio sees, it will start a new job with that name. -Command line entries following a --name entry will apply to that job, -until there are no more entries or a new --name entry is seen. This is -similar to the job file options, where each option applies to the current -job until a new [] job entry is seen. - -fio does not need to run as root, except if the files or devices specified -in the job section requires that. Some other options may also be restricted, -such as memory locking, io scheduler switching, and decreasing the nice value. + Swap the verification header with data somewhere else in the block before + writing. It is swapped back before verifying. +.. option:: verify_interval=int -4.0 Job file format -------------------- -As previously described, fio accepts one or more job files describing -what it is supposed to do. The job file format is the classic ini file, -where the names enclosed in [] brackets define the job name. You are free -to use any ascii name you want, except 'global' which has special meaning. -A global section sets defaults for the jobs described in that file. A job -may override a global section parameter, and a job file may even have -several global sections if so desired. A job is only affected by a global -section residing above it. If the first character in a line is a ';' or a -'#', the entire line is discarded as a comment. + Write the verification header at a finer granularity than the + :option:`blocksize`. It will be written for chunks the size of + ``verify_interval``. 
:option:`blocksize` should divide this evenly. -So let's look at a really simple job file that defines two processes, each -randomly reading from a 128MB file. +.. option:: verify_pattern=str -; -- start job file -- -[global] -rw=randread -size=128m + If set, fio will fill the I/O buffers with this pattern. Fio defaults to + filling with totally random bytes, but sometimes it's interesting to fill + with a known pattern for I/O verification purposes. Depending on the width + of the pattern, fio will fill 1/2/3/4 bytes of the buffer at the time (it can + be either a decimal or a hex number). The ``verify_pattern`` if larger than + a 32-bit quantity has to be a hex number that starts with either "0x" or + "0X". Use with :option:`verify`. Also, ``verify_pattern`` supports %o + format, which means that for each block offset will be written and then + verified back, e.g.:: -[job1] + verify_pattern=%o -[job2] + Or use combination of everything:: -; -- end job file -- + verify_pattern=0xff%o"abcd"-12 -As you can see, the job file sections themselves are empty as all the -described parameters are shared. As no filename= option is given, fio -makes up a filename for each of the jobs as it sees fit. On the command -line, this job would look as follows: +.. option:: verify_fatal=bool -$ fio --name=global --rw=randread --size=128m --name=job1 --name=job2 + Normally fio will keep checking the entire contents before quitting on a + block verification failure. If this option is set, fio will exit the job on + the first observed failure. Default: false. +.. option:: verify_dump=bool -Let's look at an example that has a number of processes writing randomly -to files. + If set, dump the contents of both the original data block and the data block + we read off disk to files. This allows later analysis to inspect just what + kind of data corruption occurred. Off by default. 
-; -- start job file -- -[random-writers] -ioengine=libaio -iodepth=4 -rw=randwrite -bs=32k -direct=0 -size=64m -numjobs=4 - -; -- end job file -- - -Here we have no global section, as we only have one job defined anyway. -We want to use async io here, with a depth of 4 for each file. We also -increased the buffer size used to 32KB and define numjobs to 4 to -fork 4 identical jobs. The result is 4 processes each randomly writing -to their own 64MB file. Instead of using the above job file, you could -have given the parameters on the command line. For this case, you would -specify: +.. option:: verify_async=int -$ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4 + Fio will normally verify I/O inline from the submitting thread. This option + takes an integer describing how many async offload threads to create for I/O + verification instead, causing fio to offload the duty of verifying I/O + contents to one or more separate threads. If using this offload option, even + sync I/O engines can benefit from using an :option:`iodepth` setting higher + than 1, as it allows them to have I/O in flight while verifies are running. + Defaults to 0 async threads, i.e. verification is not asynchronous. -When fio is utilized as a basis of any reasonably large test suite, it might be -desirable to share a set of standardized settings across multiple job files. -Instead of copy/pasting such settings, any section may pull in an external -.fio file with 'include filename' directive, as in the following example: +.. 
option:: verify_async_cpus=str -; -- start job file including.fio -- -[global] -filename=/tmp/test -filesize=1m -include glob-include.fio - -[test] -rw=randread -bs=4k -time_based=1 -runtime=10 -include test-include.fio -; -- end job file including.fio -- - -; -- start job file glob-include.fio -- -thread=1 -group_reporting=1 -; -- end job file glob-include.fio -- - -; -- start job file test-include.fio -- -ioengine=libaio -iodepth=4 -; -- end job file test-include.fio -- - -Settings pulled into a section apply to that section only (except global -section). Include directives may be nested in that any included file may -contain further include directive(s). Include files may not contain [] -sections. + Tell fio to set the given CPU affinity on the async I/O verification + threads. See :option:`cpus_allowed` for the format used. +.. option:: verify_backlog=int -4.1 Environment variables -------------------------- + Fio will normally verify the written contents of a job that utilizes verify + once that job has completed. In other words, everything is written then + everything is read back and verified. You may want to verify continually + instead for a variety of reasons. Fio stores the meta data associated with + an I/O block in memory, so for large verify workloads, quite a bit of memory + would be used up holding this meta data. If this option is enabled, fio will + write only N blocks before verifying these blocks. -fio also supports environment variable expansion in job files. Any -sub-string of the form "${VARNAME}" as part of an option value (in other -words, on the right of the `='), will be expanded to the value of the -environment variable called VARNAME. If no such environment variable -is defined, or VARNAME is the empty string, the empty string will be -substituted. +.. option:: verify_backlog_batch=int -As an example, let's look at a sample fio invocation and job file: + Control how many blocks fio will verify if :option:`verify_backlog` is + set. 
If not set, will default to the value of :option:`verify_backlog` + (meaning the entire queue is read back and verified). If + ``verify_backlog_batch`` is less than :option:`verify_backlog` then not all + blocks will be verified, if ``verify_backlog_batch`` is larger than + :option:`verify_backlog`, some blocks will be verified more than once. -$ SIZE=64m NUMJOBS=4 fio jobfile.fio +.. option:: verify_state_save=bool -; -- start job file -- -[random-writers] -rw=randwrite -size=${SIZE} -numjobs=${NUMJOBS} -; -- end job file -- + When a job exits during the write phase of a verify workload, save its + current state. This allows fio to replay up until that point, if the verify + state is loaded for the verify read phase. The format of the filename is, + roughly:: -This will expand to the following equivalent job file at runtime: + <type>-<jobname>-<jobindex>-verify.state. -; -- start job file -- -[random-writers] -rw=randwrite -size=64m -numjobs=4 -; -- end job file -- + <type> is "local" for a local run, "sock" for a client/server socket + connection, and "ip" (192.168.0.1, for instance) for a networked + client/server connection. Defaults to true. -fio ships with a few example job files, you can also look there for -inspiration. +.. option:: verify_state_load=bool -4.2 Reserved keywords ---------------------- + If a verify termination trigger was used, fio stores the current write state + of each thread. This can be used at verification time so that fio knows how + far it should verify. Without this information, fio will run a full + verification pass, according to the settings in the job file used. Default: + false. -Additionally, fio has a set of reserved keywords that will be replaced -internally with the appropriate value. Those keywords are: +.. option:: trim_percentage=int -$pagesize The architecture page size of the running system -$mb_memory Megabytes of total memory in the system -$ncpus Number of online available CPUs + Number of verify blocks to discard/trim.
-These can be used on the command line or in the job file, and will be -automatically substituted with the current system values when the job -is run. Simple math is also supported on these keywords, so you can -perform actions like: - -size=8*$mb_memory - -and get that properly expanded to 8 times the size of memory in the -machine. - - -5.0 Detailed list of parameters -------------------------------- - -This section describes in details each parameter associated with a job. -Some parameters take an option of a given type, such as an integer or -a string. Anywhere a numeric value is required, an arithmetic expression -may be used, provided it is surrounded by parentheses. Supported operators -are: - - addition (+) - subtraction (-) - multiplication (*) - division (/) - modulus (%) - exponentiation (^) +.. option:: trim_verify_zero=bool -For time values in expressions, units are microseconds by default. This is -different than for time values not in expressions (not enclosed in -parentheses). The following types are used: + Verify that trim/discarded blocks are returned as zeros. -str String. This is a sequence of alpha characters. -time Integer with possible time suffix. In seconds unless otherwise - specified, use eg 10m for 10 minutes. Accepts s/m/h for seconds, - minutes, and hours, and accepts 'ms' (or 'msec') for milliseconds, - and 'us' (or 'usec') for microseconds. -int SI integer. A whole number value, which may contain a suffix - describing the base of the number. Accepted suffixes are k/m/g/t/p, - meaning kilo, mega, giga, tera, and peta. The suffix is not case - sensitive, and you may also include trailing 'b' (eg 'kb' is the same - as 'k'). So if you want to specify 4096, you could either write - out '4096' or just give 4k. The suffixes signify base 2 values, so - 1024 is 1k and 1024k is 1m and so on, unless the suffix is explicitly - set to a base 10 value using 'kib', 'mib', 'gib', etc. If that is the - case, then 1000 is used as the multiplier. 
This can be handy for - disks, since manufacturers generally use base 10 values when listing - the capacity of a drive. If the option accepts an upper and lower - range, use a colon ':' or minus '-' to separate such values. May also - include a prefix to indicate numbers base. If 0x is used, the number - is assumed to be hexadecimal. See irange. -bool Boolean. Usually parsed as an integer, however only defined for - true and false (1 and 0). -irange Integer range with suffix. Allows value range to be given, such - as 1024-4096. A colon may also be used as the separator, eg - 1k:4k. If the option allows two sets of ranges, they can be - specified with a ',' or '/' delimiter: 1k-4k/8k-32k. Also see - int. -float_list A list of floating point numbers, separated by a ':' character. - -With the above in mind, here follows the complete list of fio job -parameters. - -name=str ASCII name of the job. This may be used to override the - name printed by fio for this job. Otherwise the job - name is used. On the command line this parameter has the - special purpose of also signaling the start of a new - job. - -wait_for=str Specifies the name of the already defined job to wait - for. Single waitee name only may be specified. If set, the job - won't be started until all workers of the waitee job are done. - - Wait_for operates on the job name basis, so there are a few - limitations. First, the waitee must be defined prior to the - waiter job (meaning no forward references). Second, if a job - is being referenced as a waitee, it must have a unique name - (no duplicate waitees). - -description=str Text description of the job. Doesn't do anything except - dump this text description when this job is run. It's - not parsed. - -directory=str Prefix filenames with this directory. Used to place files - in a different location than "./". See the 'filename' option - for escaping certain characters. 
- -filename=str Fio normally makes up a filename based on the job name, - thread number, and file number. If you want to share - files between threads in a job or several jobs, specify - a filename for each of them to override the default. - If the ioengine is file based, you can specify a number of - files by separating the names with a ':' colon. So if you - wanted a job to open /dev/sda and /dev/sdb as the two working - files, you would use filename=/dev/sda:/dev/sdb. On Windows, - disk devices are accessed as \\.\PhysicalDrive0 for the first - device, \\.\PhysicalDrive1 for the second etc. Note: Windows - and FreeBSD prevent write access to areas of the disk - containing in-use data (e.g. filesystems). - If the wanted filename does need to include a colon, then - escape that with a '\' character. For instance, if the filename - is "/dev/dsk/foo@3,0:c", then you would use - filename="/dev/dsk/foo@3,0\:c". '-' is a reserved name, meaning - stdin or stdout. Which of the two depends on the read/write - direction set. - -filename_format=str - If sharing multiple files between jobs, it is usually necessary - to have fio generate the exact names that you want. By default, - fio will name a file based on the default file format - specification of jobname.jobnumber.filenumber. With this - option, that can be customized. Fio will recognize and replace - the following keywords in this string: - - $jobname - The name of the worker thread or process. - - $jobnum - The incremental number of the worker thread or - process. - - $filenum - The incremental number of the file for that worker - thread or process. - - To have dependent jobs share a set of files, this option can - be set to have fio generate filenames that are shared between - the two. For instance, if testfiles.$filenum is specified, - file number 4 for any job will be named testfiles.4. The - default of $jobname.$jobnum.$filenum will be used if - no other format specifier is given. 
- -unique_filename=bool To avoid collisions between networked clients, fio - defaults to prefixing any generated filenames (with a directory - specified) with the source of the client connecting. To disable - this behavior, set this option to 0. - -opendir=str Tell fio to recursively add any file it can find in this - directory and down the file system tree. - -lockfile=str Fio defaults to not locking any files before it does - IO to them. If a file or file descriptor is shared, fio - can serialize IO to that file to make the end result - consistent. This is usual for emulating real workloads that - share files. The lock modes are: - - none No locking. The default. - exclusive Only one thread/process may do IO, - excluding all others. - readwrite Read-write locking on the file. Many - readers may access the file at the - same time, but writes get exclusive - access. - -readwrite=str -rw=str Type of io pattern. Accepted values are: - - read Sequential reads - write Sequential writes - randwrite Random writes - randread Random reads - rw,readwrite Sequential mixed reads and writes - randrw Random mixed reads and writes - trimwrite Mixed trims and writes. Blocks will be - trimmed first, then written to. - - Fio defaults to read if the option is not specified. - For the mixed io types, the default is to split them 50/50. - For certain types of io the result may still be skewed a bit, - since the speed may be different. It is possible to specify - a number of IO's to do before getting a new offset, this is - done by appending a ':' to the end of the string given. - For a random read, it would look like 'rw=randread:8' for - passing in an offset modifier with a value of 8. If the - suffix is used with a sequential IO pattern, then the value - specified will be added to the generated offset for each IO. - For instance, using rw=write:4k will skip 4k for every - write. It turns sequential IO into sequential IO with holes. - See the 'rw_sequencer' option. 
- -rw_sequencer=str If an offset modifier is given by appending a number to - the rw= line, then this option controls how that - number modifies the IO offset being generated. Accepted - values are: - - sequential Generate sequential offset - identical Generate the same offset - - 'sequential' is only useful for random IO, where fio would - normally generate a new random offset for every IO. If you - append eg 8 to randread, you would get a new random offset for - every 8 IO's. The result would be a seek for only every 8 - IO's, instead of for every IO. Use rw=randread:8 to specify - that. As sequential IO is already sequential, setting - 'sequential' for that would not result in any differences. - 'identical' behaves in a similar fashion, except it sends - the same offset 8 number of times before generating a new - offset. - -kb_base=int The base unit for a kilobyte. The defacto base is 2^10, 1024. - Storage manufacturers like to use 10^3 or 1000 as a base - ten unit instead, for obvious reasons. Allow values are - 1024 or 1000, with 1024 being the default. - -unified_rw_reporting=bool Fio normally reports statistics on a per - data direction basis, meaning that read, write, and trim are - accounted and reported separately. If this option is set, - the fio will sum the results and report them as "mixed" - instead. - -randrepeat=bool For random IO workloads, seed the generator in a predictable - way so that results are repeatable across repetitions. - Defaults to true. - -randseed=int Seed the random number generators based on this seed value, to - be able to control what sequence of output is being generated. - If not set, the random sequence depends on the randrepeat - setting. - -fallocate=str Whether pre-allocation is performed when laying down files. 
- Accepted values are: - - none Do not pre-allocate space - posix Pre-allocate via posix_fallocate() - keep Pre-allocate via fallocate() with - FALLOC_FL_KEEP_SIZE set - 0 Backward-compatible alias for 'none' - 1 Backward-compatible alias for 'posix' - - May not be available on all supported platforms. 'keep' is only - available on Linux.If using ZFS on Solaris this must be set to - 'none' because ZFS doesn't support it. Default: 'posix'. - -fadvise_hint=bool By default, fio will use fadvise() to advise the kernel - on what IO patterns it is likely to issue. Sometimes you - want to test specific IO patterns without telling the - kernel about it, in which case you can disable this option. - The following options are supported: - - sequential Use FADV_SEQUENTIAL - random Use FADV_RANDOM - 1 Backwards-compatible hint for basing - the hint on the fio workload. Will use - FADV_SEQUENTIAL for a sequential - workload, and FADV_RANDOM for a random - workload. - 0 Backwards-compatible setting for not - issing a fadvise hint. - -fadvise_stream=int Notify the kernel what write stream ID to place these - writes under. Only supported on Linux. Note, this option - may change going forward. - -size=int The total size of file io for this job. Fio will run until - this many bytes has been transferred, unless runtime is - limited by other options (such as 'runtime', for instance, - or increased/decreased by 'io_size'). Unless specific nrfiles - and filesize options are given, fio will divide this size - between the available files specified by the job. If not set, - fio will use the full size of the given files or devices. - If the files do not exist, size must be given. It is also - possible to give size as a percentage between 1 and 100. If - size=20% is given, fio will use 20% of the full size of the - given files or devices. 
- -io_size=int -io_limit=int Normally fio operates within the region set by 'size', which - means that the 'size' option sets both the region and size of - IO to be performed. Sometimes that is not what you want. With - this option, it is possible to define just the amount of IO - that fio should do. For instance, if 'size' is set to 20G and - 'io_size' is set to 5G, fio will perform IO within the first - 20G but exit when 5G have been done. The opposite is also - possible - if 'size' is set to 20G, and 'io_size' is set to - 40G, then fio will do 40G of IO within the 0..20G region. - -filesize=int Individual file sizes. May be a range, in which case fio - will select sizes for files at random within the given range - and limited to 'size' in total (if that is given). If not - given, each created file is the same size. - -file_append=bool Perform IO after the end of the file. Normally fio will - operate within the size of a file. If this option is set, then - fio will append to the file instead. This has identical - behavior to setting offset to the size of a file. This option - is ignored on non-regular files. - -fill_device=bool -fill_fs=bool Sets size to something really large and waits for ENOSPC (no - space left on device) as the terminating condition. Only makes - sense with sequential write. For a read workload, the mount - point will be filled first then IO started on the result. This - option doesn't make sense if operating on a raw device node, - since the size of that is already known by the file system. - Additionally, writing beyond end-of-device will not return - ENOSPC there. - -blocksize=int -bs=int The block size used for the io units. Defaults to 4k. Values - can be given for both read and writes. If a single int is - given, it will apply to both. If a second int is specified - after a comma, it will apply to writes only. In other words, - the format is either bs=read_and_write or bs=read,write,trim. 
- bs=4k,8k will thus use 4k blocks for reads, 8k blocks for - writes, and 8k for trims. You can terminate the list with - a trailing comma. bs=4k,8k, would use the default value for - trims.. If you only wish to set the write size, you - can do so by passing an empty read size - bs=,8k will set - 8k for writes and leave the read default value. - -blockalign=int -ba=int At what boundary to align random IO offsets. Defaults to - the same as 'blocksize' the minimum blocksize given. - Minimum alignment is typically 512b for using direct IO, - though it usually depends on the hardware block size. This - option is mutually exclusive with using a random map for - files, so it will turn off that option. - -blocksize_range=irange -bsrange=irange Instead of giving a single block size, specify a range - and fio will mix the issued io block sizes. The issued - io unit will always be a multiple of the minimum value - given (also see bs_unaligned). Applies to both reads and - writes, however a second range can be given after a comma. - See bs=. - -bssplit=str Sometimes you want even finer grained control of the - block sizes issued, not just an even split between them. - This option allows you to weight various block sizes, - so that you are able to define a specific amount of - block sizes issued. The format for this option is: - - bssplit=blocksize/percentage:blocksize/percentage - - for as many block sizes as needed. So if you want to define - a workload that has 50% 64k blocks, 10% 4k blocks, and - 40% 32k blocks, you would write: - - bssplit=4k/10:64k/50:32k/40 - - Ordering does not matter. If the percentage is left blank, - fio will fill in the remaining values evenly. So a bssplit - option like this one: - - bssplit=4k/50:1k/:32k/ - - would have 50% 4k ios, and 25% 1k and 32k ios. The percentages - always add up to 100, if bssplit is given a range that adds - up to more, it will error out. - - bssplit also supports giving separate splits to reads and - writes. 
The format is identical to what bs= accepts. You - have to separate the read and write parts with a comma. So - if you want a workload that has 50% 2k reads and 50% 4k reads, - while having 90% 4k writes and 10% 8k writes, you would - specify: - - bssplit=2k/50:4k/50,4k/90:8k/10 - -blocksize_unaligned -bs_unaligned If this option is given, any byte size value within bsrange - may be used as a block range. This typically wont work with - direct IO, as that normally requires sector alignment. - -bs_is_seq_rand If this option is set, fio will use the normal read,write - blocksize settings as sequential,random instead. Any random - read or write will use the WRITE blocksize settings, and any - sequential read or write will use the READ blocksize setting. - -zero_buffers If this option is given, fio will init the IO buffers to - all zeroes. The default is to fill them with random data. - -refill_buffers If this option is given, fio will refill the IO buffers - on every submit. The default is to only fill it at init - time and reuse that data. Only makes sense if zero_buffers - isn't specified, naturally. If data verification is enabled, - refill_buffers is also automatically enabled. - -scramble_buffers=bool If refill_buffers is too costly and the target is - using data deduplication, then setting this option will - slightly modify the IO buffer contents to defeat normal - de-dupe attempts. This is not enough to defeat more clever - block compression attempts, but it will stop naive dedupe of - blocks. Default: true. - -buffer_compress_percentage=int If this is set, then fio will attempt to - provide IO buffer content (on WRITEs) that compress to - the specified level. Fio does this by providing a mix of - random data and a fixed pattern. The fixed pattern is either - zeroes, or the pattern specified by buffer_pattern. If the - pattern option is used, it might skew the compression ratio - slightly. 
Note that this is per block size unit, for file/disk - wide compression level that matches this setting, you'll also - want to set refill_buffers. - -buffer_compress_chunk=int See buffer_compress_percentage. This - setting allows fio to manage how big the ranges of random - data and zeroed data is. Without this set, fio will - provide buffer_compress_percentage of blocksize random - data, followed by the remaining zeroed. With this set - to some chunk size smaller than the block size, fio can - alternate random and zeroed data throughout the IO - buffer. - -buffer_pattern=str If set, fio will fill the io buffers with this - pattern. If not set, the contents of io buffers is defined by - the other options related to buffer contents. The setting can - be any pattern of bytes, and can be prefixed with 0x for hex - values. It may also be a string, where the string must then - be wrapped with "", e.g.: +.. option:: trim_backlog=int - buffer_pattern="abcd" - or - buffer_pattern=-12 - or - buffer_pattern=0xdeadface + Trim after this number of blocks are written. - Also you can combine everything together in any order: - buffer_pattern=0xdeadface"abcd"-12 +.. option:: trim_backlog_batch=int -dedupe_percentage=int If set, fio will generate this percentage of - identical buffers when writing. These buffers will be - naturally dedupable. The contents of the buffers depend on - what other buffer compression settings have been set. It's - possible to have the individual buffers either fully - compressible, or not at all. This option only controls the - distribution of unique buffers. - -nrfiles=int Number of files to use for this job. Defaults to 1. - -openfiles=int Number of files to keep open at the same time. Defaults to - the same as nrfiles, can be set smaller to limit the number - simultaneous opens. - -file_service_type=str Defines how fio decides which file from a job to - service next. The following types are defined: - - random Just choose a file at random. 
- - roundrobin Round robin over open files. This - is the default. - - sequential Finish one file before moving on to - the next. Multiple files can still be - open depending on 'openfiles'. - - zipf Use a zipfian distribution to decide what file - to access. - - pareto Use a pareto distribution to decide what file - to access. - - gauss Use a gaussian (normal) distribution to decide - what file to access. - - For random, roundrobin, and sequential, a postfix can be - appended to tell fio how many I/Os to issue before switching - to a new file. For example, specifying - 'file_service_type=random:8' would cause fio to issue 8 I/Os - before selecting a new file at random. For the non-uniform - distributions, a floating point postfix can be given to - influence how the distribution is skewed. See - 'random_distribution' for a description of how that would work. - -ioengine=str Defines how the job issues io to the file. The following - types are defined: - - sync Basic read(2) or write(2) io. lseek(2) is - used to position the io location. - - psync Basic pread(2) or pwrite(2) io. Default on all - supported operating systems except for Windows. - - vsync Basic readv(2) or writev(2) IO. - - pvsync Basic preadv(2) or pwritev(2) IO. - - pvsync2 Basic preadv2(2) or pwritev2(2) IO. - - libaio Linux native asynchronous io. Note that Linux - may only support queued behaviour with - non-buffered IO (set direct=1 or buffered=0). - This engine defines engine specific options. - - posixaio glibc posix asynchronous io. - - solarisaio Solaris native asynchronous io. - - windowsaio Windows native asynchronous io. - Default on Windows. - - mmap File is memory mapped and data copied - to/from using memcpy(3). - - splice splice(2) is used to transfer the data and - vmsplice(2) to transfer data from user - space to the kernel. - - sg SCSI generic sg v3 io. 
May either be - synchronous using the SG_IO ioctl, or if - the target is an sg character device - we use read(2) and write(2) for asynchronous - io. - - null Doesn't transfer any data, just pretends - to. This is mainly used to exercise fio - itself and for debugging/testing purposes. - - net Transfer over the network to given host:port. - Depending on the protocol used, the hostname, - port, listen and filename options are used to - specify what sort of connection to make, while - the protocol option determines which protocol - will be used. - This engine defines engine specific options. - - netsplice Like net, but uses splice/vmsplice to - map data and send/receive. - This engine defines engine specific options. - - cpuio Doesn't transfer any data, but burns CPU - cycles according to the cpuload= and - cpuchunks= options. Setting cpuload=85 - will cause that job to do nothing but burn - 85% of the CPU. In case of SMP machines, - use numjobs= to get desired CPU - usage, as the cpuload only loads a single - CPU at the desired rate. A job never finishes - unless there is at least one non-cpuio job. - - guasi The GUASI IO engine is the Generic Userspace - Asyncronous Syscall Interface approach - to async IO. See - - http://www.xmailserver.org/guasi-lib.html - - for more info on GUASI. - - rdma The RDMA I/O engine supports both RDMA - memory semantics (RDMA_WRITE/RDMA_READ) and - channel semantics (Send/Recv) for the - InfiniBand, RoCE and iWARP protocols. - - falloc IO engine that does regular fallocate to - simulate data transfer as fio ioengine. - DDIR_READ does fallocate(,mode = keep_size,) - DDIR_WRITE does fallocate(,mode = 0) - DDIR_TRIM does fallocate(,mode = punch_hole) - - e4defrag IO engine that does regular EXT4_IOC_MOVE_EXT - ioctls to simulate defragment activity in - request to DDIR_WRITE event - - rbd IO engine supporting direct access to Ceph - Rados Block Devices (RBD) via librbd without - the need to use the kernel rbd driver. 
This - ioengine defines engine specific options. - - gfapi Using Glusterfs libgfapi sync interface to - direct access to Glusterfs volumes without - options. - - gfapi_async Using Glusterfs libgfapi async interface - to direct access to Glusterfs volumes without - having to go through FUSE. This ioengine - defines engine specific options. - - libhdfs Read and write through Hadoop (HDFS). - This engine interprets offsets a little - differently. In HDFS, files once created - cannot be modified. So random writes are not - possible. To imitate this, libhdfs engine - creates bunch of small files, and engine will - pick a file out of those files based on the - offset generated by fio backend. Each jobs uses - it's own connection to HDFS. - - mtd Read, write and erase an MTD character device - (e.g., /dev/mtd0). Discards are treated as - erases. Depending on the underlying device - type, the I/O may have to go in a certain - pattern, e.g., on NAND, writing sequentially - to erase blocks and discarding before - overwriting. The writetrim mode works well - for this constraint. - - pmemblk Read and write through the NVML libpmemblk - interface. - - dev-dax Read and write through a DAX device exposed - from persistent memory. - - external Prefix to specify loading an external - IO engine object file. Append the engine - filename, eg ioengine=external:/tmp/foo.o - to load ioengine foo.o in /tmp. - -iodepth=int This defines how many io units to keep in flight against - the file. The default is 1 for each file defined in this - job, can be overridden with a larger value for higher - concurrency. Note that increasing iodepth beyond 1 will not - affect synchronous ioengines (except for small degress when - verify_async is in use). Even async engines may impose OS - restrictions causing the desired depth not to be achieved. - This may happen on Linux when using libaio and not setting - direct=1, since buffered IO is not async on that OS. 
Keep an - eye on the IO depth distribution in the fio output to verify - that the achieved depth is as expected. Default: 1. - -iodepth_batch_submit=int -iodepth_batch=int This defines how many pieces of IO to submit at once. - It defaults to 1 which means that we submit each IO - as soon as it is available, but can be raised to submit - bigger batches of IO at the time. If it is set to 0 the iodepth - value will be used. - -iodepth_batch_complete_min=int -iodepth_batch_complete=int This defines how many pieces of IO to retrieve - at once. It defaults to 1 which means that we'll ask - for a minimum of 1 IO in the retrieval process from - the kernel. The IO retrieval will go on until we - hit the limit set by iodepth_low. If this variable is - set to 0, then fio will always check for completed - events before queuing more IO. This helps reduce - IO latency, at the cost of more retrieval system calls. - -iodepth_batch_complete_max=int This defines maximum pieces of IO to - retrieve at once. This variable should be used along with - iodepth_batch_complete_min=int variable, specifying the range - of min and max amount of IO which should be retrieved. By default - it is equal to iodepth_batch_complete_min value. + Trim this number of I/O blocks. - Example #1: +.. option:: experimental_verify=bool - iodepth_batch_complete_min=1 - iodepth_batch_complete_max= + Enable experimental verification. - which means that we will retrieve at least 1 IO and up to the - whole submitted queue depth. If none of IO has been completed - yet, we will wait. +Steady state +~~~~~~~~~~~~ - Example #2: +.. option:: steadystate=str:float, ss=str:float - iodepth_batch_complete_min=0 - iodepth_batch_complete_max= + Define the criterion and limit for assessing steady state performance. The + first parameter designates the criterion whereas the second parameter sets + the threshold. When the criterion falls below the threshold for the + specified duration, the job will stop. 
For example, `iops_slope:0.1%` will + direct fio to terminate the job when the least squares regression slope + falls below 0.1% of the mean IOPS. If :option:`group_reporting` is enabled + this will apply to all jobs in the group. Below is the list of available + steady state assessment criteria. All assessments are carried out using only + data from the rolling collection window. Threshold limits can be expressed + as a fixed value or as a percentage of the mean in the collection window. - which means that we can retrieve up to the whole submitted - queue depth, but if none of IO has been completed yet, we will - NOT wait and immediately exit the system call. In this example - we simply do polling. - -iodepth_low=int The low water mark indicating when to start filling - the queue again. Defaults to the same as iodepth, meaning - that fio will attempt to keep the queue full at all times. - If iodepth is set to eg 16 and iodepth_low is set to 4, then - after fio has filled the queue of 16 requests, it will let - the depth drain down to 4 before starting to fill it again. - -io_submit_mode=str This option controls how fio submits the IO to - the IO engine. The default is 'inline', which means that the - fio job threads submit and reap IO directly. If set to - 'offload', the job threads will offload IO submission to a - dedicated pool of IO threads. This requires some coordination - and thus has a bit of extra overhead, especially for lower - queue depth IO where it can increase latencies. The benefit - is that fio can manage submission rates independently of - the device completion rates. This avoids skewed latency - reporting if IO gets back up on the device side (the - coordinated omission problem). - -direct=bool If value is true, use non-buffered io. This is usually - O_DIRECT. Note that ZFS on Solaris doesn't support direct io. - On Windows the synchronous ioengines don't support direct io. - -atomic=bool If value is true, attempt to use atomic direct IO. 
Atomic - writes are guaranteed to be stable once acknowledged by - the operating system. Only Linux supports O_ATOMIC right - now. - -buffered=bool If value is true, use buffered io. This is the opposite - of the 'direct' option. Defaults to true. - -offset=int Start io at the given offset in the file. The data before - the given offset will not be touched. This effectively - caps the file size at real_size - offset. - -offset_increment=int If this is provided, then the real offset becomes - offset + offset_increment * thread_number, where the thread - number is a counter that starts at 0 and is incremented for - each sub-job (i.e. when numjobs option is specified). This - option is useful if there are several jobs which are intended - to operate on a file in parallel disjoint segments, with - even spacing between the starting points. - -number_ios=int Fio will normally perform IOs until it has exhausted the size - of the region set by size=, or if it exhaust the allocated - time (or hits an error condition). With this setting, the - range/size can be set independently of the number of IOs to - perform. When fio reaches this number, it will exit normally - and report status. Note that this does not extend the amount - of IO that will be done, it will only stop fio if this - condition is met before other end-of-job criteria. - -fsync=int If writing to a file, issue a sync of the dirty data - for every number of blocks given. For example, if you give - 32 as a parameter, fio will sync the file for every 32 - writes issued. If fio is using non-buffered io, we may - not sync the file. The exception is the sg io engine, which - synchronizes the disk cache anyway. - -fdatasync=int Like fsync= but uses fdatasync() to only sync data and not - metadata blocks. - In FreeBSD and Windows there is no fdatasync(), this falls back - to using fsync() - -sync_file_range=str:val Use sync_file_range() for every 'val' number of - write operations. 
Fio will track range of writes that - have happened since the last sync_file_range() call. 'str' - can currently be one or more of: - - wait_before SYNC_FILE_RANGE_WAIT_BEFORE - write SYNC_FILE_RANGE_WRITE - wait_after SYNC_FILE_RANGE_WAIT_AFTER - - So if you do sync_file_range=wait_before,write:8, fio would - use SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE for - every 8 writes. Also see the sync_file_range(2) man page. - This option is Linux specific. - -overwrite=bool If true, writes to a file will always overwrite existing - data. If the file doesn't already exist, it will be - created before the write phase begins. If the file exists - and is large enough for the specified write phase, nothing - will be done. - -end_fsync=bool If true, fsync file contents when a write stage has completed. - -fsync_on_close=bool If true, fio will fsync() a dirty file on close. - This differs from end_fsync in that it will happen on every - file close, not just at the end of the job. - -rwmixread=int How large a percentage of the mix should be reads. - -rwmixwrite=int How large a percentage of the mix should be writes. If both - rwmixread and rwmixwrite is given and the values do not add - up to 100%, the latter of the two will be used to override - the first. This may interfere with a given rate setting, - if fio is asked to limit reads or writes to a certain rate. - If that is the case, then the distribution may be skewed. - -random_distribution=str:float By default, fio will use a completely uniform - random distribution when asked to perform random IO. Sometimes - it is useful to skew the distribution in specific ways, - ensuring that some parts of the data is more hot than others. 
- fio includes the following distribution models: - - random Uniform random distribution - zipf Zipf distribution - pareto Pareto distribution - gauss Normal (gaussian) distribution - zoned Zoned random distribution - - When using a zipf or pareto distribution, an input value - is also needed to define the access pattern. For zipf, this - is the zipf theta. For pareto, it's the pareto power. Fio - includes a test program, genzipf, that can be used to visualize - what the given input values will yield in terms of hit rates. - If you wanted to use zipf with a theta of 1.2, you would use - random_distribution=zipf:1.2 as the option. If a non-uniform - model is used, fio will disable use of the random map. For - the gauss distribution, a normal deviation is supplied as - a value between 0 and 100. - - For a zoned distribution, fio supports specifying percentages - of IO access that should fall within what range of the file or - device. For example, given a criteria of: - - 60% of accesses should be to the first 10% - 30% of accesses should be to the next 20% - 8% of accesses should be to the next 30% - 2% of accesses should be to the next 40% - - we can define that through zoning of the random accesses. For - the above example, the user would do: - - random_distribution=zoned:60/10:30/20:8/30:2/40 - - similarly to how bssplit works for setting ranges and - percentages of block sizes. Like bssplit, it's possible to - specify separate zones for reads, writes, and trims. If just - one set is given, it'll apply to all of them. - -percentage_random=int For a random workload, set how big a percentage should - be random. This defaults to 100%, in which case the workload - is fully random. It can be set anywhere from 0 to 100. - Setting it to 0 would make the workload fully sequential. Any - setting in between will result in a random mix of sequential - and random IO, at the given percentages. It is possible to - set different values for reads, writes, and trim. 
To do so, - simply use a comma separated list. See blocksize. - -norandommap Normally fio will cover every block of the file when doing - random IO. If this option is given, fio will just get a - new random offset without looking at past io history. This - means that some blocks may not be read or written, and that - some blocks may be read/written more than once. If this option - is used with verify= and multiple blocksizes (via bsrange=), - only intact blocks are verified, i.e., partially-overwritten - blocks are ignored. - -softrandommap=bool See norandommap. If fio runs with the random block map - enabled and it fails to allocate the map, if this option is - set it will continue without a random block map. As coverage - will not be as complete as with random maps, this option is - disabled by default. - -random_generator=str Fio supports the following engines for generating - IO offsets for random IO: - - tausworthe Strong 2^88 cycle random number generator - lfsr Linear feedback shift register generator - tausworthe64 Strong 64-bit 2^258 cycle random number - generator - - Tausworthe is a strong random number generator, but it - requires tracking on the side if we want to ensure that - blocks are only read or written once. LFSR guarantees - that we never generate the same offset twice, and it's - also less computationally expensive. It's not a true - random generator, however, though for IO purposes it's - typically good enough. LFSR only works with single - block sizes, not with workloads that use multiple block - sizes. If used with such a workload, fio may read or write - some blocks multiple times. The default value is tausworthe, - unless the required space exceeds 2^32 blocks. If it does, - then tausworthe64 is selected automatically. - -nice=int Run the job with the given nice value. See man nice(2). 
- - On Windows, values less than -15 set the process class to "High"; - -1 through -15 set "Above Normal"; 1 through 15 "Below Normal"; - and above 15 "Idle" priority class. - -prio=int Set the io priority value of this job. Linux limits us to - a positive value between 0 and 7, with 0 being the highest. - See man ionice(1). Refer to an appropriate manpage for - other operating systems since meaning of priority may differ. - -prioclass=int Set the io priority class. See man ionice(1). - -thinktime=int Stall the job x microseconds after an io has completed before - issuing the next. May be used to simulate processing being - done by an application. See thinktime_blocks and - thinktime_spin. - -thinktime_spin=int - Only valid if thinktime is set - pretend to spend CPU time - doing something with the data received, before falling back - to sleeping for the rest of the period specified by - thinktime. - -thinktime_blocks=int - Only valid if thinktime is set - control how many blocks - to issue, before waiting 'thinktime' usecs. If not set, - defaults to 1 which will make fio wait 'thinktime' usecs - after every block. This effectively makes any queue depth - setting redundant, since no more than 1 IO will be queued - before we have to complete it and do our thinktime. In - other words, this setting effectively caps the queue depth - if the latter is larger. - -rate=int Cap the bandwidth used by this job. The number is in bytes/sec, - the normal suffix rules apply. You can use rate=500k to limit - reads and writes to 500k each, or you can specify read and - writes separately. Using rate=1m,500k would limit reads to - 1MB/sec and writes to 500KB/sec. Capping only reads or - writes can be done with rate=,500k or rate=500k,. The former - will only limit writes (to 500KB/sec), the latter will only - limit reads. - -rate_min=int Tell fio to do whatever it can to maintain at least this - bandwidth. Failing to meet this requirement, will cause - the job to exit. 
The same format as rate is used for - read vs write separation. - -rate_iops=int Cap the bandwidth to this number of IOPS. Basically the same - as rate, just specified independently of bandwidth. If the - job is given a block size range instead of a fixed value, - the smallest block size is used as the metric. The same format - as rate is used for read vs write separation. - -rate_iops_min=int If fio doesn't meet this rate of IO, it will cause - the job to exit. The same format as rate is used for read vs - write separation. - -rate_process=str This option controls how fio manages rated IO - submissions. The default is 'linear', which submits IO in a - linear fashion with fixed delays between IOs that gets - adjusted based on IO completion rates. If this is set to - 'poisson', fio will submit IO based on a more real world - random request flow, known as the Poisson process - (https://en.wikipedia.org/wiki/Poisson_process). The lambda - will be 10^6 / IOPS for the given workload. - -latency_target=int If set, fio will attempt to find the max performance - point that the given workload will run at while maintaining a - latency below this target. The values is given in microseconds. - See latency_window and latency_percentile - -latency_window=int Used with latency_target to specify the sample window - that the job is run at varying queue depths to test the - performance. The value is given in microseconds. - -latency_percentile=float The percentage of IOs that must fall within the - criteria specified by latency_target and latency_window. If not - set, this defaults to 100.0, meaning that all IOs must be equal - or below to the value set by latency_target. - -max_latency=int If set, fio will exit the job if it exceeds this maximum - latency. It will exit with an ETIME error. - -rate_cycle=int Average bandwidth for 'rate' and 'rate_min' over this number - of milliseconds. - -cpumask=int Set the CPU affinity of this job. 
The parameter given is a - bitmask of allowed CPUs the job may run on. So if you want - the allowed CPUs to be 1 and 5, you would pass the decimal - value of (1 << 1 | 1 << 5), or 34. See man - sched_setaffinity(2). This may not work on all supported - operating systems or kernel versions. This option doesn't - work well for a higher CPU count than what you can store in - an integer mask, so it can only control cpus 1-32. For - boxes with larger CPU counts, use cpus_allowed. - -cpus_allowed=str Controls the same options as cpumask, but it allows a text - setting of the permitted CPUs instead. So to use CPUs 1 and - 5, you would specify cpus_allowed=1,5. This option also - allows a range of CPUs. Say you wanted a binding to CPUs - 1, 5, and 8-15, you would set cpus_allowed=1,5,8-15. - -cpus_allowed_policy=str Set the policy of how fio distributes the CPUs - specified by cpus_allowed or cpumask. Two policies are - supported: - - shared All jobs will share the CPU set specified. - split Each job will get a unique CPU from the CPU set. - - 'shared' is the default behaviour, if the option isn't - specified. If split is specified, then fio will assign - one cpu per job. If not enough CPUs are given for the jobs - listed, then fio will roundrobin the CPUs in the set. - -numa_cpu_nodes=str Set this job running on specified NUMA nodes' CPUs. The - arguments allow a comma delimited list of cpu numbers, - A-B ranges, or 'all'. Note, to enable numa options support, - fio must be built on a system with libnuma-dev(el) installed. - -numa_mem_policy=str Set this job's memory policy and corresponding NUMA - nodes. Format of the arguments: - mode[:nodelist] - `mode' is one of the following memory policy: - default, prefer, bind, interleave, local - For `default' and `local' memory policy, no node is - needed to be specified. - For `prefer', only one node is allowed. - For `bind' and `interleave', it allows a comma delimited - list of numbers, A-B ranges, or 'all'. 
- -startdelay=time Start this job the specified number of seconds after fio - has started. Only useful if the job file contains several - jobs, and you want to delay starting some jobs to a certain - time. - -runtime=time Tell fio to terminate processing after the specified number - of seconds. It can be quite hard to determine for how long - a specified job will run, so this parameter is handy to - cap the total runtime to a given time. - -time_based If set, fio will run for the duration of the runtime - specified even if the file(s) are completely read or - written. It will simply loop over the same workload - as many times as the runtime allows. - -ramp_time=time If set, fio will run the specified workload for this amount - of time before logging any performance numbers. Useful for - letting performance settle before logging results, thus - minimizing the runtime required for stable results. Note - that the ramp_time is considered lead in time for a job, - thus it will increase the total runtime if a special timeout - or runtime is specified. - -steadystate=str:float -ss=str:float Define the criterion and limit for assessing steady state - performance. The first parameter designates the criterion - whereas the second parameter sets the threshold. When the - criterion falls below the threshold for the specified duration, - the job will stop. For example, iops_slope:0.1% will direct fio - to terminate the job when the least squares regression slope - falls below 0.1% of the mean IOPS. If group_reporting is - enabled this will apply to all jobs in the group. Below is the - list of available steady state assessment criteria. All - assessments are carried out using only data from the rolling - collection window. Threshold limits can be expressed as a fixed - value or as a percentage of the mean in the collection window. - iops Collect IOPS data. 
Stop the job if all - individual IOPS measurements are within the - specified limit of the mean IOPS (e.g., iops:2 - means that all individual IOPS values must be - within 2 of the mean, whereas iops:0.2% means - that all individual IOPS values must be within - 0.2% of the mean IOPS to terminate the job). - iops_slope - Collect IOPS data and calculate the least - squares regression slope. Stop the job if the - slope falls below the specified limit. - bw Collect bandwidth data. Stop the job if all - individual bandwidth measurements are within - the specified limit of the mean bandwidth. - bw_slope - Collect bandwidth data and calculate the least - squares regression slope. Stop the job if the - slope falls below the specified limit. - -steadystate_duration=time -ss_dur=time A rolling window of this duration will be used to judge whether - steady state has been reached. Data will be collected once per - second. The default is 0 which disables steady state detection. - -steadystate_ramp_time=time -ss_ramp=time Allow the job to run for the specified duration before - beginning data collection for checking the steady state job - termination criterion. The default is 0. - -invalidate=bool Invalidate the buffer/page cache parts for this file prior - to starting io. Defaults to true. - -sync=bool Use sync io for buffered writes. For the majority of the - io engines, this means using O_SYNC. - -iomem=str -mem=str Fio can use various types of memory as the io unit buffer. - The allowed values are: - - malloc Use memory from malloc(3) as the buffers. - Default memory type. - - shm Use shared memory as the buffers. Allocated - through shmget(2). - - shmhuge Same as shm, but use huge pages as backing. - - mmap Use mmap to allocate buffers. May either be - anonymous memory, or can be file backed if - a filename is given after the option. The - format is mem=mmap:/path/to/file. - - mmaphuge Use a memory mapped huge file as the buffer - backing. 
Append filename after mmaphuge, ala - mem=mmaphuge:/hugetlbfs/file - - mmapshared Same as mmap, but use a MMAP_SHARED - mapping. - - The area allocated is a function of the maximum allowed - bs size for the job, multiplied by the io depth given. Note - that for shmhuge and mmaphuge to work, the system must have - free huge pages allocated. This can normally be checked - and set by reading/writing /proc/sys/vm/nr_hugepages on a - Linux system. Fio assumes a huge page is 4MB in size. So - to calculate the number of huge pages you need for a given - job file, add up the io depth of all jobs (normally one unless - iodepth= is used) and multiply by the maximum bs set. Then - divide that number by the huge page size. You can see the - size of the huge pages in /proc/meminfo. If no huge pages - are allocated by having a non-zero number in nr_hugepages, - using mmaphuge or shmhuge will fail. Also see hugepage-size. - - mmaphuge also needs to have hugetlbfs mounted and the file - location should point there. So if it's mounted in /huge, - you would use mem=mmaphuge:/huge/somefile. - -iomem_align=int This indicates the memory alignment of the IO memory buffers. - Note that the given alignment is applied to the first IO unit - buffer, if using iodepth the alignment of the following buffers - are given by the bs used. In other words, if using a bs that is - a multiple of the page sized in the system, all buffers will - be aligned to this value. If using a bs that is not page - aligned, the alignment of subsequent IO memory buffers is the - sum of the iomem_align and bs used. - -hugepage-size=int - Defines the size of a huge page. Must at least be equal - to the system setting, see /proc/meminfo. Defaults to 4MB. - Should probably always be a multiple of megabytes, so using - hugepage-size=Xm is the preferred way to set this to avoid - setting a non-pow-2 bad value. - -exitall When one job finishes, terminate the rest. 
The default is - to wait for each job to finish, sometimes that is not the - desired action. - -exitall_on_error When one job finishes in error, terminate the rest. The - default is to wait for each job to finish. - -bwavgtime=int Average the calculated bandwidth over the given time. Value - is specified in milliseconds. If the job also does bandwidth - logging through 'write_bw_log', then the minimum of this option - and 'log_avg_msec' will be used. Default: 500ms. - -iopsavgtime=int Average the calculated IOPS over the given time. Value - is specified in milliseconds. If the job also does IOPS logging - through 'write_iops_log', then the minimum of this option and - 'log_avg_msec' will be used. Default: 500ms. - -create_serialize=bool If true, serialize the file creation for the jobs. - This may be handy to avoid interleaving of data - files, which may greatly depend on the filesystem - used and even the number of processors in the system. - -create_fsync=bool fsync the data file after creation. This is the - default. - -create_on_open=bool Don't pre-setup the files for IO, just create open() - when it's time to do IO to that file. - -create_only=bool If true, fio will only run the setup phase of the job. - If files need to be laid out or updated on disk, only - that will be done. The actual job contents are not - executed. - -allow_file_create=bool If true, fio is permitted to create files as part - of its workload. This is the default behavior. If this - option is false, then fio will error out if the files it - needs to use don't already exist. Default: true. - -allow_mounted_write=bool If this isn't set, fio will abort jobs that - are destructive (eg that write) to what appears to be a - mounted device or partition. This should help catch creating - inadvertently destructive tests, not realizing that the test - will destroy data on the mounted file system. Default: false. 
- -pre_read=bool If this is given, files will be pre-read into memory before - starting the given IO operation. This will also clear - the 'invalidate' flag, since it is pointless to pre-read - and then drop the cache. This will only work for IO engines - that are seek-able, since they allow you to read the same data - multiple times. Thus it will not work on eg network or splice - IO. - -unlink=bool Unlink the job files when done. Not the default, as repeated - runs of that job would then waste time recreating the file - set again and again. - -unlink_each_loop=bool Unlink job files after each iteration or loop. - -loops=int Run the specified number of iterations of this job. Used - to repeat the same workload a given number of times. Defaults - to 1. - -verify_only Do not perform specified workload---only verify data still - matches previous invocation of this workload. This option - allows one to check data multiple times at a later date - without overwriting it. This option makes sense only for - workloads that write data, and does not support workloads - with the time_based option set. - -do_verify=bool Run the verify phase after a write phase. Only makes sense if - verify is set. Defaults to 1. - -verify=str If writing to a file, fio can verify the file contents - after each iteration of the job. Each verification method also implies - verification of special header, which is written to the beginning of - each block. This header also includes meta information, like offset - of the block, block number, timestamp when block was written, etc. - verify=str can be combined with verify_pattern=str option. - The allowed values are: - - md5 Use an md5 sum of the data area and store - it in the header of each block. - - crc64 Use an experimental crc64 sum of the data - area and store it in the header of each - block. - - crc32c Use a crc32c sum of the data area and store - it in the header of each block. 
- - crc32c-intel Use hardware assisted crc32c calculation - provided on SSE4.2 enabled processors. Falls - back to regular software crc32c, if not - supported by the system. - - crc32 Use a crc32 sum of the data area and store - it in the header of each block. - - crc16 Use a crc16 sum of the data area and store - it in the header of each block. - - crc7 Use a crc7 sum of the data area and store - it in the header of each block. - - xxhash Use xxhash as the checksum function. Generally - the fastest software checksum that fio - supports. - - sha512 Use sha512 as the checksum function. - - sha256 Use sha256 as the checksum function. - - sha1 Use optimized sha1 as the checksum function. - - meta This option is deprecated, since now meta information is - included in generic verification header and meta verification - happens by default. For detailed information see the description - of the verify=str setting. This option is kept because of - compatibility's sake with old configurations. Do not use it. - - pattern Verify a strict pattern. Normally fio includes - a header with some basic information and - checksumming, but if this option is set, only - the specific pattern set with 'verify_pattern' - is verified. - - null Only pretend to verify. Useful for testing - internals with ioengine=null, not for much - else. - - This option can be used for repeated burn-in tests of a - system to make sure that the written data is also - correctly read back. If the data direction given is - a read or random read, fio will assume that it should - verify a previously written file. If the data direction - includes any form of write, the verify will be of the - newly written data. - -verifysort=bool If set, fio will sort written verify blocks when it deems - it faster to read them back in a sorted manner. This is - often the case when overwriting an existing file, since - the blocks are already laid out in the file system. 
You - can ignore this option unless doing huge amounts of really - fast IO where the red-black tree sorting CPU time becomes - significant. - -verify_offset=int Swap the verification header with data somewhere else - in the block before writing. Its swapped back before - verifying. - -verify_interval=int Write the verification header at a finer granularity - than the blocksize. It will be written for chunks the - size of header_interval. blocksize should divide this - evenly. - -verify_pattern=str If set, fio will fill the io buffers with this - pattern. Fio defaults to filling with totally random - bytes, but sometimes it's interesting to fill with a known - pattern for io verification purposes. Depending on the - width of the pattern, fio will fill 1/2/3/4 bytes of the - buffer at the time(it can be either a decimal or a hex number). - The verify_pattern if larger than a 32-bit quantity has to - be a hex number that starts with either "0x" or "0X". Use - with verify=str. Also, verify_pattern supports %o format, - which means that for each block offset will be written and - then verified back, e.g.: + **iops** + Collect IOPS data. Stop the job if all individual IOPS measurements + are within the specified limit of the mean IOPS (e.g., ``iops:2`` + means that all individual IOPS values must be within 2 of the mean, + whereas ``iops:0.2%`` means that all individual IOPS values must be + within 0.2% of the mean IOPS to terminate the job). - verify_pattern=%o + **iops_slope** + Collect IOPS data and calculate the least squares regression + slope. Stop the job if the slope falls below the specified limit. - Or use combination of everything: - verify_pattern=0xff%o"abcd"-12 + **bw** + Collect bandwidth data. Stop the job if all individual bandwidth + measurements are within the specified limit of the mean bandwidth. + + **bw_slope** + Collect bandwidth data and calculate the least squares regression + slope. Stop the job if the slope falls below the specified limit. 
+ +.. option:: steadystate_duration=time, ss_dur=time -verify_fatal=bool Normally fio will keep checking the entire contents - before quitting on a block verification failure. If this - option is set, fio will exit the job on the first observed - failure. - -verify_dump=bool If set, dump the contents of both the original data - block and the data block we read off disk to files. This - allows later analysis to inspect just what kind of data - corruption occurred. Off by default. - -verify_async=int Fio will normally verify IO inline from the submitting - thread. This option takes an integer describing how many - async offload threads to create for IO verification instead, - causing fio to offload the duty of verifying IO contents - to one or more separate threads. If using this offload - option, even sync IO engines can benefit from using an - iodepth setting higher than 1, as it allows them to have - IO in flight while verifies are running. - -verify_async_cpus=str Tell fio to set the given CPU affinity on the - async IO verification threads. See cpus_allowed for the - format used. - -verify_backlog=int Fio will normally verify the written contents of a - job that utilizes verify once that job has completed. In - other words, everything is written then everything is read - back and verified. You may want to verify continually - instead for a variety of reasons. Fio stores the meta data - associated with an IO block in memory, so for large - verify workloads, quite a bit of memory would be used up - holding this meta data. If this option is enabled, fio - will write only N blocks before verifying these blocks. - -verify_backlog_batch=int Control how many blocks fio will verify - if verify_backlog is set. If not set, will default to - the value of verify_backlog (meaning the entire queue - is read back and verified). 
If verify_backlog_batch is - less than verify_backlog then not all blocks will be verified, - if verify_backlog_batch is larger than verify_backlog, some - blocks will be verified more than once. - -verify_state_save=bool When a job exits during the write phase of a verify - workload, save its current state. This allows fio to replay - up until that point, if the verify state is loaded for the - verify read phase. The format of the filename is, roughly, - ---verify.state. is "local" - for a local run, "sock" for a client/server socket connection, - and "ip" (192.168.0.1, for instance) for a networked - client/server connection. - -verify_state_load=bool If a verify termination trigger was used, fio stores - the current write state of each thread. This can be used at - verification time so that fio knows how far it should verify. - Without this information, fio will run a full verification - pass, according to the settings in the job file used. - -stonewall -wait_for_previous Wait for preceding jobs in the job file to exit, before - starting this one. Can be used to insert serialization - points in the job file. A stone wall also implies starting - a new reporting group. - -new_group Start a new reporting group. See: group_reporting. - -numjobs=int Create the specified number of clones of this job. May be - used to setup a larger number of threads/processes doing - the same thing. Each thread is reported separately; to see - statistics for all clones as a whole, use group_reporting in - conjunction with new_group. - -group_reporting It may sometimes be interesting to display statistics for - groups of jobs as a whole instead of for each individual job. - This is especially true if 'numjobs' is used; looking at - individual thread/process output quickly becomes unwieldy. - To see the final report per-group instead of per-job, use - 'group_reporting'. 
Jobs in a file will be part of the same - reporting group, unless if separated by a stonewall, or by - using 'new_group'. - -thread fio defaults to forking jobs, however if this option is - given, fio will use pthread_create(3) to create threads - instead. - -zonesize=int Divide a file into zones of the specified size. See zoneskip. - -zoneskip=int Skip the specified number of bytes when zonesize data has - been read. The two zone options can be used to only do - io on zones of a file. - -write_iolog=str Write the issued io patterns to the specified file. See - read_iolog. Specify a separate file for each job, otherwise - the iologs will be interspersed and the file may be corrupt. - -read_iolog=str Open an iolog with the specified file name and replay the - io patterns it contains. This can be used to store a - workload and replay it sometime later. The iolog given - may also be a blktrace binary file, which allows fio - to replay a workload captured by blktrace. See blktrace - for how to capture such logging data. For blktrace replay, - the file needs to be turned into a blkparse binary data - file first (blkparse -o /dev/null -d file_for_fio.bin). - -replay_no_stall=int When replaying I/O with read_iolog the default behavior - is to attempt to respect the time stamps within the log and - replay them with the appropriate delay between IOPS. By - setting this variable fio will not respect the timestamps and - attempt to replay them as fast as possible while still - respecting ordering. The result is the same I/O pattern to a - given device, but different timings. - -replay_redirect=str While replaying I/O patterns using read_iolog the - default behavior is to replay the IOPS onto the major/minor - device that each IOP was recorded from. This is sometimes - undesirable because on a different machine those major/minor - numbers can map to a different device. Changing hardware on - the same system can also result in a different major/minor - mapping. 
Replay_redirect causes all IOPS to be replayed onto - the single specified device regardless of the device it was - recorded from. i.e. replay_redirect=/dev/sdc would cause all - IO in the blktrace or iolog to be replayed onto /dev/sdc. - This means multiple devices will be replayed onto a single - device, if the trace contains multiple devices. If you want - multiple devices to be replayed concurrently to multiple - redirected devices you must blkparse your trace into separate - traces and replay them with independent fio invocations. - Unfortunately this also breaks the strict time ordering - between multiple device accesses. - -replay_align=int Force alignment of IO offsets and lengths in a trace - to this power of 2 value. - -replay_scale=int Scale sector offsets down by this factor when - replaying traces. - -per_job_logs=bool If set, this generates bw/clat/iops log with per - file private filenames. If not set, jobs with identical names - will share the log filename. Default: true. - -write_bw_log=str If given, write a bandwidth log of the jobs in this job - file. Can be used to store data of the bandwidth of the - jobs in their lifetime. The included fio_generate_plots - script uses gnuplot to turn these text files into nice - graphs. See write_lat_log for behaviour of given - filename. For this option, the suffix is _bw.x.log, where - x is the index of the job (1..N, where N is the number of - jobs). If 'per_job_logs' is false, then the filename will not - include the job index. See 'Log File Formats'. - -write_lat_log=str Same as write_bw_log, except that this option stores io - submission, completion, and total latencies instead. If no - filename is given with this option, the default filename of - "jobname_type.log" is used. Even if the filename is given, - fio will still append the type of log. So if one specifies + A rolling window of this duration will be used to judge whether steady state + has been reached. Data will be collected once per second. 
The default is 0 + which disables steady state detection. When the unit is omitted, the + value is interpreted in seconds. + +.. option:: steadystate_ramp_time=time, ss_ramp=time + + Allow the job to run for the specified duration before beginning data + collection for checking the steady state job termination criterion. The + default is 0. When the unit is omitted, the value is interpreted in seconds. + + +Measurements and reporting +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. option:: per_job_logs=bool + + If set, this generates bw/clat/iops logs with per-job private filenames. If + not set, jobs with identical names will share the log filename. Default: + true. + +.. option:: group_reporting + + It may sometimes be interesting to display statistics for groups of jobs as + a whole instead of for each individual job. This is especially true if + :option:`numjobs` is used; looking at individual thread/process output + quickly becomes unwieldy. To see the final report per-group instead of + per-job, use :option:`group_reporting`. Jobs in a file will be part of the + same reporting group, unless separated by a :option:`stonewall`, or by + using :option:`new_group`. + +.. option:: new_group + + Start a new reporting group. See: :option:`group_reporting`. If not given, + all jobs in a file will be part of the same reporting group, unless + separated by a :option:`stonewall`. + +.. option:: stats=bool + + By default, fio collects and shows final output results for all jobs + that run. If this option is set to 0, then fio will ignore this job in + the final stat output. + +.. option:: write_bw_log=str + + If given, write a bandwidth log for this job. Can be used to store data of + the bandwidth of the jobs in their lifetime. The included + :command:`fio_generate_plots` script uses :command:`gnuplot` to turn these + text files into nice graphs. See :option:`write_lat_log` for behavior of + given filename.
For this option, the postfix is :file:`_bw.x.log`, where `x` + is the index of the job (`1..N`, where `N` is the number of jobs). If + :option:`per_job_logs` is false, then the filename will not include the job + index. See `Log File Formats`_. + +.. option:: write_lat_log=str + + Same as :option:`write_bw_log`, except that this option stores I/O + submission, completion, and total latencies instead. If no filename is given + with this option, the default filename of :file:`jobname_type.log` is + used. Even if the filename is given, fio will still append the type of + log. So if one specifies:: write_lat_log=foo - The actual log names will be foo_slat.x.log, foo_clat.x.log, - and foo_lat.x.log, where x is the index of the job (1..N, - where N is the number of jobs). This helps fio_generate_plot - find the logs automatically. If 'per_job_logs' is false, then - the filename will not include the job index. See 'Log File - Formats'. - -write_hist_log=str Same as write_lat_log, but writes I/O completion - latency histograms. If no filename is given with this option, the - default filename of "jobname_clat_hist.x.log" is used, where x is - the index of the job (1..N, where N is the number of jobs). Even - if the filename is given, fio will still append the type of log. - If per_job_logs is false, then the filename will not include the - job index. See 'Log File Formats'. - -write_iops_log=str Same as write_bw_log, but writes IOPS. If no filename is - given with this option, the default filename of - "jobname_type.x.log" is used,where x is the index of the job - (1..N, where N is the number of jobs). Even if the filename - is given, fio will still append the type of log. If - 'per_job_logs' is false, then the filename will not include - the job index. See 'Log File Formats'. - -log_avg_msec=int By default, fio will log an entry in the iops, latency, - or bw log for every IO that completes. When writing to the - disk log, that can quickly grow to a very large size. 
Setting - this option makes fio average the each log entry over the - specified period of time, reducing the resolution of the log. - See log_max_value as well. Defaults to 0, logging all entries. - -log_hist_msec=int Same as log_avg_msec, but logs entries for completion - latency histograms. Computing latency percentiles from averages of - intervals using log_avg_msec is innacurate. Setting this option makes - fio log histogram entries over the specified period of time, reducing - log sizes for high IOPS devices while retaining percentile accuracy. - See log_hist_coarseness as well. Defaults to 0, meaning histogram - logging is disabled. - -log_hist_coarseness=int Integer ranging from 0 to 6, defining the coarseness - of the resolution of the histogram logs enabled with log_hist_msec. For - each increment in coarseness, fio outputs half as many bins. Defaults to - 0, for which histogram logs contain 1216 latency bins. See - 'Log File Formats'. - -log_max_value=bool If log_avg_msec is set, fio logs the average over that - window. If you instead want to log the maximum value, set this - option to 1. Defaults to 0, meaning that averaged values are - logged. - -log_offset=int If this is set, the iolog options will include the byte - offset for the IO entry as well as the other data values. - -log_compression=int If this is set, fio will compress the IO logs as - it goes, to keep the memory footprint lower. When a log - reaches the specified size, that chunk is removed and - compressed in the background. Given that IO logs are - fairly highly compressible, this yields a nice memory - savings for longer runs. The downside is that the - compression will consume some background CPU cycles, so - it may impact the run. This, however, is also true if - the logging ends up consuming most of the system memory. - So pick your poison. The IO logs are saved normally at the - end of a run, by decompressing the chunks and storing them - in the specified log file. 
This feature depends on the - availability of zlib. - -log_compression_cpus=str Define the set of CPUs that are allowed to - handle online log compression for the IO jobs. This can - provide better isolation between performance sensitive jobs, - and background compression work. - -log_store_compressed=bool If set, fio will store the log files in a - compressed format. They can be decompressed with fio, using - the --inflate-log command line parameter. The files will be - stored with a .fz suffix. - -log_unix_epoch=bool If set, fio will log Unix timestamps to the log - files produced by enabling write_type_log for each log type, instead - of the default zero-based timestamps. - -block_error_percentiles=bool If set, record errors in trim block-sized - units from writes and trims and output a histogram of - how many trims it took to get to errors, and what kind - of error was encountered. - -lockmem=int Pin down the specified amount of memory with mlock(2). Can - potentially be used instead of removing memory or booting - with less memory to simulate a smaller amount of memory. - The amount specified is per worker. - -exec_prerun=str Before running this job, issue the command specified - through system(3). Output is redirected in a file called - jobname.prerun.txt. - -exec_postrun=str After the job completes, issue the command specified - though system(3). Output is redirected in a file called - jobname.postrun.txt. - -ioscheduler=str Attempt to switch the device hosting the file to the specified - io scheduler before running. - -disk_util=bool Generate disk utilization statistics, if the platform - supports it. Defaults to on. - -disable_lat=bool Disable measurements of total latency numbers. Useful - only for cutting back the number of calls to gettimeofday, - as that does impact performance at really high IOPS rates. - Note that to really get rid of a large amount of these - calls, this option must be used with disable_slat and - disable_bw as well. 
- -disable_clat=bool Disable measurements of completion latency numbers. See - disable_lat. - -disable_slat=bool Disable measurements of submission latency numbers. See - disable_slat. - -disable_bw=bool Disable measurements of throughput/bandwidth numbers. See - disable_lat. - -clat_percentiles=bool Enable the reporting of percentiles of - completion latencies. - -percentile_list=float_list Overwrite the default list of percentiles - for completion latencies and the block error histogram. - Each number is a floating number in the range (0,100], - and the maximum length of the list is 20. Use ':' - to separate the numbers, and list the numbers in ascending - order. For example, --percentile_list=99.5:99.9 will cause - fio to report the values of completion latency below which - 99.5% and 99.9% of the observed latencies fell, respectively. - -clocksource=str Use the given clocksource as the base of timing. The - supported options are: - - gettimeofday gettimeofday(2) - - clock_gettime clock_gettime(2) - - cpu Internal CPU clock source - - cpu is the preferred clocksource if it is reliable, as it - is very fast (and fio is heavy on time calls). Fio will - automatically use this clocksource if it's supported and - considered reliable on the system it is running on, unless - another clocksource is specifically set. For x86/x86-64 CPUs, - this means supporting TSC Invariant. - -gtod_reduce=bool Enable all of the gettimeofday() reducing options - (disable_clat, disable_slat, disable_bw) plus reduce - precision of the timeout somewhat to really shrink - the gettimeofday() call count. With this option enabled, - we only do about 0.4% of the gtod() calls we would have - done if all time keeping was enabled. - -gtod_cpu=int Sometimes it's cheaper to dedicate a single thread of - execution to just getting the current time. Fio (and - databases, for instance) are very intensive on gettimeofday() - calls. 
With this option, you can set one CPU aside for - doing nothing but logging current time to a shared memory - location. Then the other threads/processes that run IO - workloads need only copy that segment, instead of entering - the kernel with a gettimeofday() call. The CPU set aside - for doing these time calls will be excluded from other - uses. Fio will manually clear it from the CPU mask of other - jobs. - -continue_on_error=str Normally fio will exit the job on the first observed - failure. If this option is set, fio will continue the job when - there is a 'non-fatal error' (EIO or EILSEQ) until the runtime - is exceeded or the I/O size specified is completed. If this - option is used, there are two more stats that are appended, - the total error count and the first error. The error field - given in the stats is the first error that was hit during the - run. - - The allowed values are: - - none Exit on any IO or verify errors. - - read Continue on read errors, exit on all others. - - write Continue on write errors, exit on all others. - - io Continue on any IO error, exit on all others. - - verify Continue on verify errors, exit on all others. - - all Continue on all errors. - - 0 Backward-compatible alias for 'none'. - - 1 Backward-compatible alias for 'all'. - -ignore_error=str Sometimes you want to ignore some errors during test - in that case you can specify error list for each error type. - ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST - errors for given error type is separated with ':'. Error - may be symbol ('ENOSPC', 'ENOMEM') or integer. - Example: - ignore_error=EAGAIN,ENOSPC:122 - This option will ignore EAGAIN from READ, and ENOSPC and - 122(EDQUOT) from WRITE. - -error_dump=bool If set dump every error even if it is non fatal, true - by default. If disabled only fatal error will be dumped - -cgroup=str Add job to this control group. If it doesn't exist, it will - be created. 
The system must have a mounted cgroup blkio - mount point for this to work. If your system doesn't have it - mounted, you can do so with: + The actual log names will be :file:`foo_slat.x.log`, :file:`foo_clat.x.log`, + and :file:`foo_lat.x.log`, where `x` is the index of the job (`1..N`, where `N` + is the number of jobs). This helps :command:`fio_generate_plots` find the + logs automatically. If :option:`per_job_logs` is false, then the filename + will not include the job index. See `Log File Formats`_. - # mount -t cgroup -o blkio none /cgroup +.. option:: write_hist_log=str -cgroup_weight=int Set the weight of the cgroup to this value. See - the documentation that comes with the kernel, allowed values - are in the range of 100..1000. - -cgroup_nodelete=bool Normally fio will delete the cgroups it has created after - the job completion. To override this behavior and to leave - cgroups around after the job completion, set cgroup_nodelete=1. - This can be useful if one wants to inspect various cgroup - files after job completion. Default: false - -uid=int Instead of running as the invoking user, set the user ID to - this value before the thread/process does any work. - -gid=int Set group ID, see uid. - -flow_id=int The ID of the flow. If not specified, it defaults to being a - global flow. See flow. - -flow=int Weight in token-based flow control. If this value is used, then - there is a 'flow counter' which is used to regulate the - proportion of activity between two or more jobs. fio attempts - to keep this flow counter near zero. The 'flow' parameter - stands for how much should be added or subtracted to the flow - counter on each iteration of the main I/O loop. That is, if - one job has flow=8 and another job has flow=-1, then there - will be a roughly 1:8 ratio in how much one runs vs the other. - -flow_watermark=int The maximum value that the absolute value of the flow - counter is allowed to reach before the job must wait for a - lower value of the counter. 
+ Same as :option:`write_lat_log`, but writes I/O completion latency + histograms. If no filename is given with this option, the default filename + of :file:`jobname_clat_hist.x.log` is used, where `x` is the index of the + job (`1..N`, where `N` is the number of jobs). Even if the filename is given, + fio will still append the type of log. If :option:`per_job_logs` is false, + then the filename will not include the job index. See `Log File Formats`_. -flow_sleep=int The period of time, in microseconds, to wait after the flow - watermark has been exceeded before retrying operations +.. option:: write_iops_log=str -In addition, there are some parameters which are only valid when a specific -ioengine is in use. These are used identically to normal parameters, with the -caveat that when used on the command line, they must come after the ioengine -that defines them is selected. - -[libaio] userspace_reap Normally, with the libaio engine in use, fio will use - the io_getevents system call to reap newly returned events. - With this flag turned on, the AIO ring will be read directly - from user-space to reap events. The reaping mode is only - enabled when polling for a minimum of 0 events (eg when - iodepth_batch_complete=0). - -[psyncv2] hipri Set RWF_HIPRI on IO, indicating to the kernel that - it's of higher priority than normal. - -[cpuio] cpuload=int Attempt to use the specified percentage of CPU cycles. - -[cpuio] cpuchunks=int Split the load into cycles of the given time. In - microseconds. - -[cpuio] exit_on_io_done=bool Detect when IO threads are done, then exit. - -[netsplice] hostname=str -[net] hostname=str The host name or IP address to use for TCP or UDP based IO. - If the job is a TCP listener or UDP reader, the hostname is not - used and must be omitted unless it is a valid UDP multicast - address. -[libhdfs] namenode=str The host name or IP address of a HDFS cluster namenode to contact. 
- -[netsplice] port=int -[net] port=int The TCP or UDP port to bind to or connect to. If this is used -with numjobs to spawn multiple instances of the same job type, then this will -be the starting port number since fio will use a range of ports. -[libhdfs] port=int the listening port of the HFDS cluster namenode. - -[netsplice] interface=str -[net] interface=str The IP address of the network interface used to send or - receive UDP multicast - -[netsplice] ttl=int -[net] ttl=int Time-to-live value for outgoing UDP multicast packets. - Default: 1 - -[netsplice] nodelay=bool -[net] nodelay=bool Set TCP_NODELAY on TCP connections. - -[netsplice] protocol=str -[netsplice] proto=str -[net] protocol=str -[net] proto=str The network protocol to use. Accepted values are: - - tcp Transmission control protocol - tcpv6 Transmission control protocol V6 - udp User datagram protocol - udpv6 User datagram protocol V6 - unix UNIX domain socket - - When the protocol is TCP or UDP, the port must also be given, - as well as the hostname if the job is a TCP listener or UDP - reader. For unix sockets, the normal filename option should be - used and the port is invalid. - -[net] listen For TCP network connections, tell fio to listen for incoming - connections rather than initiating an outgoing connection. The - hostname must be omitted if this option is used. - -[net] pingpong Normally a network writer will just continue writing data, and - a network reader will just consume packages. If pingpong=1 - is set, a writer will send its normal payload to the reader, - then wait for the reader to send the same payload back. This - allows fio to measure network latencies. The submission - and completion latencies then measure local time spent - sending or receiving, and the completion latency measures - how long it took for the other end to receive and send back. 
- For UDP multicast traffic pingpong=1 should only be set for a - single reader when multiple readers are listening to the same - address. - -[net] window_size Set the desired socket buffer size for the connection. - -[net] mss Set the TCP maximum segment size (TCP_MAXSEG). - -[e4defrag] donorname=str - File will be used as a block donor(swap extents between files) -[e4defrag] inplace=int - Configure donor file blocks allocation strategy - 0(default): Preallocate donor's file on init - 1 : allocate space immediately inside defragment event, - and free right after event - -[rbd] clustername=str Specifies the name of the Ceph cluster. -[rbd] rbdname=str Specifies the name of the RBD. -[rbd] pool=str Specifies the name of the Ceph pool containing RBD. -[rbd] clientname=str Specifies the username (without the 'client.' prefix) - used to access the Ceph cluster. If the clustername is - specified, the clientname shall be the full type.id - string. If no type. prefix is given, fio will add - 'client.' by default. - -[mtd] skip_bad=bool Skip operations against known bad blocks. - -[libhdfs] hdfsdirectory libhdfs will create chunk in this HDFS directory -[libhdfs] chunk_size the size of the chunk to use for each file. - - -6.0 Interpreting the output ---------------------------- - -fio spits out a lot of output. While running, fio will display the -status of the jobs created. An example of that would be: - -Threads: 1: [_r] [24.8% done] [ 13509/ 8334 kb/s] [eta 00h:01m:31s] - -The characters inside the square brackets denote the current status of -each thread. The possible values (in typical life cycle order) are: - -Idle Run ----- --- -P Thread setup, but not started. -C Thread created. -I Thread initialized, waiting or generating necessary data. - p Thread running pre-reading file(s). - R Running, doing sequential reads. - r Running, doing random reads. - W Running, doing sequential writes. - w Running, doing random writes. 
- M Running, doing mixed sequential reads/writes. - m Running, doing mixed random reads/writes. - F Running, currently waiting for fsync() - f Running, finishing up (writing IO logs, etc) - V Running, doing verification of written data. -E Thread exited, not reaped by main thread yet. -_ Thread reaped, or -X Thread reaped, exited with an error. -K Thread reaped, exited due to signal. - -Fio will condense the thread string as not to take up more space on the -command line as is needed. For instance, if you have 10 readers and 10 -writers running, the output would look like this: - -Jobs: 20 (f=20): [R(10),W(10)] [4.0% done] [2103MB/0KB/0KB /s] [538K/0/0 iops] [eta 57m:36s] - -Fio will still maintain the ordering, though. So the above means that jobs -1..10 are readers, and 11..20 are writers. - -The other values are fairly self explanatory - number of threads -currently running and doing io, rate of io since last check (read speed -listed first, then write speed), and the estimated completion percentage -and time for the running group. It's impossible to estimate runtime of -the following groups (if any). Note that the string is displayed in order, -so it's possible to tell which of the jobs are currently doing what. The -first character is the first job defined in the job file, and so forth. - -When fio is done (or interrupted by ctrl-c), it will show the data for -each thread, group of threads, and disks in that order. 
For each data -direction, the output looks like: - -Client1 (g=0): err= 0: - write: io= 32MB, bw= 666KB/s, iops=89 , runt= 50320msec - slat (msec): min= 0, max= 136, avg= 0.03, stdev= 1.92 - clat (msec): min= 0, max= 631, avg=48.50, stdev=86.82 - bw (KB/s) : min= 0, max= 1196, per=51.00%, avg=664.02, stdev=681.68 - cpu : usr=1.49%, sys=0.25%, ctx=7969, majf=0, minf=17 - IO depths : 1=0.1%, 2=0.3%, 4=0.5%, 8=99.0%, 16=0.0%, 32=0.0%, >32=0.0% - submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% - complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% - issued r/w: total=0/32768, short=0/0 - lat (msec): 2=1.6%, 4=0.0%, 10=3.2%, 20=12.8%, 50=38.4%, 100=24.8%, - lat (msec): 250=15.2%, 500=0.0%, 750=0.0%, 1000=0.0%, >=2048=0.0% - -The client number is printed, along with the group id and error of that -thread. Below is the io statistics, here for writes. In the order listed, -they denote: - -io= Number of megabytes io performed -bw= Average bandwidth rate -iops= Average IOs performed per second -runt= The runtime of that thread - slat= Submission latency (avg being the average, stdev being the - standard deviation). This is the time it took to submit - the io. For sync io, the slat is really the completion - latency, since queue/complete is one operation there. This - value can be in milliseconds or microseconds, fio will choose - the most appropriate base and print that. In the example - above, milliseconds is the best scale. Note: in --minimal mode + Same as :option:`write_bw_log`, but writes IOPS. If no filename is given + with this option, the default filename of :file:`jobname_type.x.log` is + used, where `x` is the index of the job (`1..N`, where `N` is the number of + jobs). Even if the filename is given, fio will still append the type of + log. If :option:`per_job_logs` is false, then the filename will not include + the job index. See `Log File Formats`_. + +.. 
option:: log_avg_msec=int + + By default, fio will log an entry in the iops, latency, or bw log for every + I/O that completes. When writing to the disk log, that can quickly grow to a + very large size. Setting this option makes fio average each log entry + over the specified period of time, reducing the resolution of the log. See + :option:`log_max_value` as well. Defaults to 0, logging all entries. + Also see `Log File Formats`_. + +.. option:: log_hist_msec=int + + Same as :option:`log_avg_msec`, but logs entries for completion latency + histograms. Computing latency percentiles from averages of intervals using + :option:`log_avg_msec` is inaccurate. Setting this option makes fio log + histogram entries over the specified period of time, reducing log sizes for + high IOPS devices while retaining percentile accuracy. See + :option:`log_hist_coarseness` as well. Defaults to 0, meaning histogram + logging is disabled. + +.. option:: log_hist_coarseness=int + + Integer ranging from 0 to 6, defining the coarseness of the resolution of + the histogram logs enabled with :option:`log_hist_msec`. For each increment + in coarseness, fio outputs half as many bins. Defaults to 0, for which + histogram logs contain 1216 latency bins. See `Log File Formats`_. + +.. option:: log_max_value=bool + + If :option:`log_avg_msec` is set, fio logs the average over that window. If + you instead want to log the maximum value, set this option to 1. Defaults to + 0, meaning that averaged values are logged. + +.. option:: log_offset=bool + + If this is set, the iolog options will include the byte offset for the I/O + entry as well as the other data values. Defaults to 0 meaning that + offsets are not present in logs. Also see `Log File Formats`_. + +.. option:: log_compression=int + + If this is set, fio will compress the I/O logs as it goes, to keep the + memory footprint lower. When a log reaches the specified size, that chunk is + removed and compressed in the background.
Given that I/O logs are fairly + highly compressible, this yields a nice memory savings for longer runs. The + downside is that the compression will consume some background CPU cycles, so + it may impact the run. This, however, is also true if the logging ends up + consuming most of the system memory. So pick your poison. The I/O logs are + saved normally at the end of a run, by decompressing the chunks and storing + them in the specified log file. This feature depends on the availability of + zlib. + +.. option:: log_compression_cpus=str + + Define the set of CPUs that are allowed to handle online log compression for + the I/O jobs. This can provide better isolation between performance + sensitive jobs, and background compression work. + +.. option:: log_store_compressed=bool + + If set, fio will store the log files in a compressed format. They can be + decompressed with fio, using the :option:`--inflate-log` command line + parameter. The files will be stored with a :file:`.fz` suffix. + +.. option:: log_unix_epoch=bool + + If set, fio will log Unix timestamps to the log files produced by enabling + write_type_log for each log type, instead of the default zero-based + timestamps. + +.. option:: block_error_percentiles=bool + + If set, record errors in trim block-sized units from writes and trims and + output a histogram of how many trims it took to get to errors, and what kind + of error was encountered. + +.. option:: bwavgtime=int + + Average the calculated bandwidth over the given time. Value is specified in + milliseconds. If the job also does bandwidth logging through + :option:`write_bw_log`, then the minimum of this option and + :option:`log_avg_msec` will be used. Default: 500ms. + +.. option:: iopsavgtime=int + + Average the calculated IOPS over the given time. Value is specified in + milliseconds. If the job also does IOPS logging through + :option:`write_iops_log`, then the minimum of this option and + :option:`log_avg_msec` will be used. 
Default: 500ms. + +.. option:: disk_util=bool + + Generate disk utilization statistics, if the platform supports it. + Default: true. + +.. option:: disable_lat=bool + + Disable measurements of total latency numbers. Useful only for cutting back + the number of calls to :manpage:`gettimeofday(2)`, as that does impact + performance at really high IOPS rates. Note that to really get rid of a + large amount of these calls, this option must be used with + :option:`disable_slat` and :option:`disable_bw_measurement` as well. + +.. option:: disable_clat=bool + + Disable measurements of completion latency numbers. See + :option:`disable_lat`. + +.. option:: disable_slat=bool + + Disable measurements of submission latency numbers. See + :option:`disable_lat`. + +.. option:: disable_bw_measurement=bool, disable_bw=bool + + Disable measurements of throughput/bandwidth numbers. See + :option:`disable_lat`. + +.. option:: clat_percentiles=bool + + Enable the reporting of percentiles of completion latencies. This + option is mutually exclusive with :option:`lat_percentiles`. + +.. option:: lat_percentiles=bool + + Enable the reporting of percentiles of IO latencies. This is similar + to :option:`clat_percentiles`, except that this includes the + submission latency. This option is mutually exclusive with + :option:`clat_percentiles`. + +.. option:: percentile_list=float_list + + Overwrite the default list of percentiles for completion latencies and the + block error histogram. Each number is a floating number in the range + (0,100], and the maximum length of the list is 20. Use ``:`` to separate the + numbers, and list the numbers in ascending order. For example, + ``--percentile_list=99.5:99.9`` will cause fio to report the values of + completion latency below which 99.5% and 99.9% of the observed latencies + fell, respectively. + + +Error handling +~~~~~~~~~~~~~~ + +.. option:: exitall_on_error + + When one job finishes in error, terminate the rest. 
The default is to wait + for each job to finish. + +.. option:: continue_on_error=str + + Normally fio will exit the job on the first observed failure. If this option + is set, fio will continue the job when there is a 'non-fatal error' (EIO or + EILSEQ) until the runtime is exceeded or the I/O size specified is + completed. If this option is used, there are two more stats that are + appended, the total error count and the first error. The error field given + in the stats is the first error that was hit during the run. + + The allowed values are: + + **none** + Exit on any I/O or verify errors. + + **read** + Continue on read errors, exit on all others. + + **write** + Continue on write errors, exit on all others. + + **io** + Continue on any I/O error, exit on all others. + + **verify** + Continue on verify errors, exit on all others. + + **all** + Continue on all errors. + + **0** + Backward-compatible alias for 'none'. + + **1** + Backward-compatible alias for 'all'. + +.. option:: ignore_error=str + + Sometimes you want to ignore some errors during a test; in that case you can + specify an error list for each error type, instead of only being able to + ignore the default 'non-fatal error' using :option:`continue_on_error`. + The format is ``ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST``; errors for a + given error type are separated with ':'. An error may be a symbol ('ENOSPC', + 'ENOMEM') or an integer. Example:: + + ignore_error=EAGAIN,ENOSPC:122 + + This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from + WRITE. This option works by overriding :option:`continue_on_error` with + the list of errors for each error type if any. + +.. option:: error_dump=bool + + If set, dump every error even if it is non-fatal; true by default. If + disabled, only fatal errors will be dumped. + +Running predefined workloads +---------------------------- + +Fio includes predefined profiles that mimic the I/O workloads generated by +other tools. + +..
option:: profile=str + + The predefined workload to run. Current profiles are: + + **tiobench** + Threaded I/O bench (tiotest/tiobench) like workload. + + **act** + Aerospike Certification Tool (ACT) like workload. + +To view a profile's additional options use :option:`--cmdhelp` after specifying +the profile. For example:: + + $ fio --profile=act --cmdhelp + +Act profile options +~~~~~~~~~~~~~~~~~~~ + +.. option:: device-names=str + :noindex: + + Devices to use. + +.. option:: load=int + :noindex: + + ACT load multiplier. Default: 1. + +.. option:: test-duration=time + :noindex: + + How long the entire test takes to run. When the unit is omitted, the value + is given in seconds. Default: 24h. + +.. option:: threads-per-queue=int + :noindex: + + Number of read I/O threads per device. Default: 8. + +.. option:: read-req-num-512-blocks=int + :noindex: + + Number of 512B blocks to read at the time. Default: 3. + +.. option:: large-block-op-kbytes=int + :noindex: + + Size of large block ops in KiB (writes). Default: 131072. + +.. option:: prep + :noindex: + + Set to run ACT prep phase. + +Tiobench profile options +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. option:: size=str + :noindex: + + Size in MiB. + +.. option:: block=int + :noindex: + + Block size in bytes. Default: 4096. + +.. option:: numruns=int + :noindex: + + Number of runs. + +.. option:: dir=str + :noindex: + + Test directory. + +.. option:: threads=int + :noindex: + + Number of threads. + +Interpreting the output +----------------------- + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --time_based \ + --rate=1256k --bs=14K --name=quick --runtime=1s --name=mixed \ + --runtime=2m --rw=rw + +Fio spits out a lot of output. While running, fio will display the status of the +jobs created. 
An example of that would be:: + + Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s] + +The characters inside the first set of square brackets denote the current status of +each thread. The first character is the first job defined in the job file, and so +forth. The possible values (in typical life cycle order) are: + ++------+-----+-----------------------------------------------------------+ +| Idle | Run | | ++======+=====+===========================================================+ +| P | | Thread setup, but not started. | ++------+-----+-----------------------------------------------------------+ +| C | | Thread created. | ++------+-----+-----------------------------------------------------------+ +| I | | Thread initialized, waiting or generating necessary data. | ++------+-----+-----------------------------------------------------------+ +| | p | Thread running pre-reading file(s). | ++------+-----+-----------------------------------------------------------+ +| | / | Thread is in ramp period. | ++------+-----+-----------------------------------------------------------+ +| | R | Running, doing sequential reads. | ++------+-----+-----------------------------------------------------------+ +| | r | Running, doing random reads. | ++------+-----+-----------------------------------------------------------+ +| | W | Running, doing sequential writes. | ++------+-----+-----------------------------------------------------------+ +| | w | Running, doing random writes. | ++------+-----+-----------------------------------------------------------+ +| | M | Running, doing mixed sequential reads/writes. | ++------+-----+-----------------------------------------------------------+ +| | m | Running, doing mixed random reads/writes. | ++------+-----+-----------------------------------------------------------+ +| | D | Running, doing sequential trims. 
| ++------+-----+-----------------------------------------------------------+ +| | d | Running, doing random trims. | ++------+-----+-----------------------------------------------------------+ +| | F | Running, currently waiting for :manpage:`fsync(2)`. | ++------+-----+-----------------------------------------------------------+ +| | V | Running, doing verification of written data. | ++------+-----+-----------------------------------------------------------+ +| f | | Thread finishing. | ++------+-----+-----------------------------------------------------------+ +| E | | Thread exited, not reaped by main thread yet. | ++------+-----+-----------------------------------------------------------+ +| _ | | Thread reaped. | ++------+-----+-----------------------------------------------------------+ +| X | | Thread reaped, exited with an error. | ++------+-----+-----------------------------------------------------------+ +| K | | Thread reaped, exited due to signal. | ++------+-----+-----------------------------------------------------------+ + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --runtime=58m \ + --time_based --rate=2512k --bs=256K --numjobs=10 \ + --name=readers --rw=read --name=writers --rw=write + +Fio will condense the thread string as not to take up more space on the command +line than needed. For instance, if you have 10 readers and 10 writers running, +the output would look like this:: + + Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s] + +Note that the status string is displayed in order, so it's possible to tell which of +the jobs are currently doing what. In the example above this means that jobs 1--10 +are readers and 11--20 are writers. 
+ +The other values are fairly self explanatory -- number of threads currently +running and doing I/O, the number of currently open files (f=), the estimated +completion percentage, the rate of I/O since last check (read speed listed first, +then write speed and optionally trim speed) in terms of bandwidth and IOPS, +and time to completion for the current running group. It's impossible to estimate +runtime of the following groups (if any). + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=16 --ioengine=posixaio --filename=/tmp/fiofile \ + --direct=1 --size=100M --time_based --runtime=50s --rate_iops=89 \ + --bs=7K --name=Client1 --rw=write + +When fio is done (or interrupted by :kbd:`Ctrl-C`), it will show the data for +each thread, group of threads, and disks in that order. For each overall thread (or +group) the output looks like:: + + Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017 + write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec) + slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50 + clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31 + lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79 + clat percentiles (usec): + | 1.00th=[ 302], 5.00th=[ 326], 10.00th=[ 343], 20.00th=[ 363], + | 30.00th=[ 392], 40.00th=[ 404], 50.00th=[ 416], 60.00th=[ 445], + | 70.00th=[ 816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627], + | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877], + | 99.99th=[78119] + bw ( KiB/s): min= 532, max= 686, per=0.10%, avg=622.87, stdev=24.82, samples= 100 + iops : min= 76, max= 98, avg=88.98, stdev= 3.54, samples= 100 + lat (usec) : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79% + lat (msec) : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37% + lat (msec) : 100=0.65% + cpu : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21 + IO depths : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0% + submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 
32=0.0%, 64=0.0%, >=64=0.0% + complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0 + latency : target=0, window=0, percentile=100.00%, depth=8 + +The job name (or first job's name when using :option:`group_reporting`) is printed, +along with the group id, count of jobs being aggregated, last error id seen (which +is 0 when there are no errors), pid/tid of that thread and the time the job/group +completed. Below are the I/O statistics for each data direction performed (showing +writes in the example above). In the order listed, they denote: + +**read/write/trim** + The string before the colon shows the I/O direction the statistics + are for. **IOPS** is the average I/Os performed per second. **BW** + is the average bandwidth rate shown as: value in power of 2 format + (value in power of 10 format). The last two values show: (**total + I/O performed** in power of 2 format / **runtime** of that thread). + +**slat** + Submission latency (**min** being the minimum, **max** being the + maximum, **avg** being the average, **stdev** being the standard + deviation). This is the time it took to submit the I/O. For + sync I/O this row is not displayed as the slat is really the + completion latency (since queue/complete is one operation there). + This value can be in nanoseconds, microseconds or milliseconds --- + fio will choose the most appropriate base and print that (in the + example above nanoseconds was the best scale). Note: in :option:`--minimal` mode latencies are always expressed in microseconds. - clat= Completion latency. Same names as slat, this denotes the - time from submission to completion of the io pieces. For - sync io, clat will usually be equal (or very close) to 0, - as the time from submit to complete is basically just - CPU time (io has already been done, see slat explanation). - bw= Bandwidth. 
Same names as the xlat stats, but also includes - an approximate percentage of total aggregate bandwidth - this thread received in this group. This last value is - only really useful if the threads in this group are on the - same disk, since they are then competing for disk access. -cpu= CPU usage. User and system time, along with the number - of context switches this thread went through, usage of - system and user time, and finally the number of major - and minor page faults. The CPU utilization numbers are - averages for the jobs in that reporting group, while the - context and fault counters are summed. -IO depths= The distribution of io depths over the job life time. The - numbers are divided into powers of 2, so for example the - 16= entries includes depths up to that value but higher - than the previous entry. In other words, it covers the - range from 16 to 31. -IO submit= How many pieces of IO were submitting in a single submit - call. Each entry denotes that amount and below, until - the previous entry - eg, 8=100% mean that we submitted - anywhere in between 5-8 ios per submit call. -IO complete= Like the above submit number, but for completions instead. -IO issued= The number of read/write requests issued, and how many - of them were short. -IO latencies= The distribution of IO completion latencies. This is the - time from when IO leaves fio and when it gets completed. - The numbers follow the same pattern as the IO depths, - meaning that 2=1.6% means that 1.6% of the IO completed - within 2 msecs, 20=12.8% means that 12.8% of the IO - took more than 10 msecs, but less than (or equal to) 20 msecs. -After each client has been listed, the group statistics are printed. They -will look like this: +**clat** + Completion latency. Same names as slat, this denotes the time from + submission to completion of the I/O pieces. 
For sync I/O, clat will + usually be equal (or very close) to 0, as the time from submit to + complete is basically just CPU time (I/O has already been done, see slat + explanation). + +**lat** + Total latency. Same names as slat and clat, this denotes the time from + when fio created the I/O unit to completion of the I/O operation. + +**bw** + Bandwidth statistics based on samples. Same names as the xlat stats, + but also includes the number of samples taken (**samples**) and an + approximate percentage of total aggregate bandwidth this thread + received in its group (**per**). This last value is only really + useful if the threads in this group are on the same disk, since they + are then competing for disk access. + +**iops** + IOPS statistics based on samples. Same names as bw. + +**lat (nsec/usec/msec)** + The distribution of I/O completion latencies. This is the time from when + I/O leaves fio to when it gets completed. Unlike the separate + read/write/trim sections above, the data here and in the remaining + sections apply to all I/Os for the reporting group. 250=0.04% means that + 0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11% + of the I/Os required 250 to 499us for completion. + +**cpu** + CPU usage. User and system time, along with the number of context + switches this thread went through, usage of system and user time, and + finally the number of major and minor page faults. The CPU utilization + numbers are averages for the jobs in that reporting group, while the + context and fault counters are summed. -Run status group 0 (all jobs): - READ: io=64MB, aggrb=22178, minb=11355, maxb=11814, mint=2840msec, maxt=2955msec - WRITE: io=64MB, aggrb=1302, minb=666, maxb=669, mint=50093msec, maxt=50320msec +**IO depths** + The distribution of I/O depths over the job lifetime.
The numbers are + divided into powers of 2 and each entry covers depths from that value + up to those that are lower than the next entry -- e.g., 16= covers + depths from 16 to 31. Note that the range covered by a depth + distribution entry can be different to the range covered by the + equivalent submit/complete distribution entry. + +**IO submit** + How many pieces of I/O were submitted in a single submit call. Each + entry denotes that amount and below, until the previous entry -- e.g., + 16=100% means that we submitted anywhere from 9 to 16 I/Os per submit + call. Note that the range covered by a submit distribution entry can + be different to the range covered by the equivalent depth distribution + entry. + +**IO complete** + Like the above submit number, but for completions instead. + +**IO issued rwt** + The number of read/write/trim requests issued, and how many of them were + short or dropped. + +**IO latency** + These values are for `--latency-target` and related options. When + these options are engaged, this section describes the I/O depth required + to meet the specified latency target. + +.. + Example output was based on the following: + TZ=UTC fio --ioengine=null --iodepth=2 --size=100M --numjobs=2 \ + --rate_process=poisson --io_limit=32M --name=read --bs=128k \ + --rate=11M --name=write --rw=write --bs=2k --rate=700k -For each data direction, it prints: +After each client has been listed, the group statistics are printed. They +will look like this:: -io= Number of megabytes io performed. -aggrb= Aggregate bandwidth of threads in this group. -minb= The minimum average bandwidth a thread saw. -maxb= The maximum average bandwidth a thread saw. -mint= The smallest runtime of the threads in that group. -maxt= The longest runtime of the threads in that group.
+ Run status group 0 (all jobs): + READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s-10.8MiB/s (10.9MB/s-11.3MB/s), io=64.0MiB (67.1MB), run=2973-3069msec + WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s-621KiB/s (630kB/s-636kB/s), io=64.0MiB (67.1MB), run=52747-53223msec + +For each data direction it prints: + +**bw** + Aggregate bandwidth of threads in this group followed by the + minimum and maximum bandwidth of all the threads in this group. + Values outside of brackets are power-of-2 format and those + within are the equivalent value in a power-of-10 format. +**io** + Aggregate I/O performed of all threads in this group. The + format is the same as bw. +**run** + The smallest and longest runtimes of the threads in this group. -And finally, the disk statistics are printed. They will look like this: +And finally, the disk statistics are printed. This is Linux specific. They will look like this:: -Disk stats (read/write): - sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% + Disk stats (read/write): + sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% Each value is printed for both reads and writes, with reads first. The numbers denote: -ios= Number of ios performed by all groups. -merge= Number of merges io the io scheduler. -ticks= Number of ticks we kept the disk busy. -io_queue= Total time spent in the disk queue. -util= The disk utilization. A value of 100% means we kept the disk +**ios** + Number of I/Os performed by all groups. +**merge** + Number of merges performed by the I/O scheduler. +**ticks** + Number of ticks we kept the disk busy. +**in_queue** + Total time spent in the disk queue. +**util** + The disk utilization. A value of 100% means we kept the disk busy constantly, 50% would be a disk idling half of the time. -It is also possible to get fio to dump the current output while it is -running, without terminating the job. To do that, send fio the USR1 signal. 
-You can also get regularly timed dumps by using the --status-interval -parameter, or by creating a file in /tmp named fio-dump-status. If fio -sees this file, it will unlink it and dump the current output status. +It is also possible to get fio to dump the current output while it is running, +without terminating the job. To do that, send fio the **USR1** signal. You can +also get regularly timed dumps by using the :option:`--status-interval` +parameter, or by creating a file in :file:`/tmp` named +:file:`fio-dump-status`. If fio sees this file, it will unlink it and dump the +current output status. + + +Terse output +------------ + +For scripted usage where you typically want to generate tables or graphs of the +results, fio can output the results in a semicolon separated format. The format +is one long line of values, such as:: + 2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00% + A description of this job goes here. -7.0 Terse output ----------------- +The job description (if provided) follows on a second line. -For scripted usage where you typically want to generate tables or graphs -of the results, fio can output the results in a semicolon separated format. -The format is one long line of values, such as: +To enable terse output, use the :option:`--minimal` or +:option:`--output-format`\=terse command line options. The +first value is the version of the terse output format. 
If the output has to be +changed for some reason, this number will be incremented by 1 to signify that +change. -2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00% -A description of this job goes here. +Split up, the format is as follows (comments in brackets denote when a +field was introduced or whether it's specific to some terse version): -The job description (if provided) follows on a second line. + :: + + terse version, fio version [v3], jobname, groupid, error + + READ status:: + + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples + + WRITE status: + + :: + + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples + + TRIM status [all but version 3]: + + Fields are similar to READ/WRITE status. 
+ + CPU usage:: + + user, system, context switches, major faults, minor faults + + I/O depths:: + + <=1, 2, 4, 8, 16, 32, >=64 + + I/O latencies microseconds:: + + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 + + I/O latencies milliseconds:: + + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 + + Disk utilization [v3]:: + + disk name, read ios, write ios, read merges, write merges, read ticks, write ticks, + time spent in queue, disk utilization percentage + + Additional Info (dependent on continue_on_error, default off):: + + total # errors, first error code + + Additional Info (dependent on description being set):: + + Text description + +Completion latency percentiles can be a grouping of up to 20 sets, so for the +terse output fio writes all of them. Each field will look like this:: -To enable terse output, use the --minimal command line option. The first -value is the version of the terse output format. If the output has to -be changed for some reason, this number will be incremented by 1 to -signify that change. 
- -Split up, the format is as follows: - - terse version, fio version, jobname, groupid, error - READ status: - Total IO (KB), bandwidth (KB/sec), IOPS, runtime (msec) - Submission latency: min, max, mean, stdev (usec) - Completion latency: min, max, mean, stdev (usec) - Completion latency percentiles: 20 fields (see below) - Total latency: min, max, mean, stdev (usec) - Bw (KB/s): min, max, aggregate percentage of total, mean, stdev - WRITE status: - Total IO (KB), bandwidth (KB/sec), IOPS, runtime (msec) - Submission latency: min, max, mean, stdev (usec) - Completion latency: min, max, mean, stdev(usec) - Completion latency percentiles: 20 fields (see below) - Total latency: min, max, mean, stdev (usec) - Bw (KB/s): min, max, aggregate percentage of total, mean, stdev - CPU usage: user, system, context switches, major faults, minor faults - IO depths: <=1, 2, 4, 8, 16, 32, >=64 - IO latencies microseconds: <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 - IO latencies milliseconds: <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 - Disk utilization: Disk name, Read ios, write ios, - Read merges, write merges, - Read ticks, write ticks, - Time spent in queue, disk utilization percentage - Additional Info (dependent on continue_on_error, default off): total # errors, first error code - - Additional Info (dependent on description being set): Text description - -Completion latency percentiles can be a grouping of up to 20 sets, so -for the terse output fio writes all of them. Each field will look like this: - - 1.00%=6112 - -which is the Xth percentile, and the usec latency associated with it. - -For disk utilization, all disks used by fio are shown. So for each disk -there will be a disk utilization section. - - -8.0 Trace file format ---------------------- -There are two trace file format that you can encounter. The older (v1) format -is unsupported since version 1.20-rc3 (March 2008). 
It will still be described + 1.00%=6112 + +which is the Xth percentile, and the `usec` latency associated with it. + +For `Disk utilization`, all disks used by fio are shown. So for each disk there +will be a disk utilization section. + +Below is a single line containing short names for each of the fields in the +minimal output v3, separated by semicolons:: + + terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;d
isk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util + + +JSON output +------------ + +The `json` output format is intended to be both human readable and convenient +for automated parsing. For the most part its sections mirror those of the +`normal` output. The `runtime` value is reported in msec and the `bw` value is +reported in 1024 bytes per second units. + + +JSON+ output +------------ + +The `json+` output format is identical to the `json` output format except that it +adds a full dump of the completion latency bins. Each `bins` object contains a +set of (key, value) pairs where keys are latency durations and values count how +many I/Os had completion latencies of the corresponding duration. For example, +consider: + + "bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... } + +This data indicates that one I/O required 87,552ns to complete, two I/Os required +100,864ns to complete, and 7529 I/Os required 107,008ns to complete. + +Also included with fio is a Python script `fio_jsonplus_clat2csv` that takes +json+ output and generates CSV-formatted latency data suitable for plotting. + +The latency durations actually represent the midpoints of latency intervals. +For details refer to :file:`stat.h`. + + +Trace file format +----------------- + +There are two trace file formats that you can encounter. The older (v1) format is +unsupported since version 1.20-rc3 (March 2008). It will still be described below in case that you get an old trace and want to understand it. In any case the trace is a simple text file with a single action per line. 
-8.1 Trace file format v1 ------------------------- -Each line represents a single io action in the following format: +Trace file format v1 +~~~~~~~~~~~~~~~~~~~~ + +Each line represents a single I/O action in the following format:: + + rw, offset, length -rw, offset, length +where `rw=0/1` for read/write, and the `offset` and `length` entries being in bytes. -where rw=0/1 for read/write, and the offset and length entries being in bytes. +This format is not supported in fio versions >= 1.20-rc3. -This format is not supported in Fio versions => 1.20-rc3. +Trace file format v2 +~~~~~~~~~~~~~~~~~~~~ -8.2 Trace file format v2 ------------------------- -The second version of the trace file format was added in Fio version 1.17. -It allows to access more then one file per trace and has a bigger set of -possible file actions. +The second version of the trace file format was added in fio version 1.17. It +allows access to more than one file per trace and has a bigger set of possible +file actions. -The first line of the trace file has to be: +The first line of the trace file has to be:: -fio version 2 iolog + fio version 2 iolog Following this can be lines in two different formats, which are described below. -The file management format: +The file management format:: -filename action + filename action -The filename is given as an absolute path. The action can be one of these: +The `filename` is given as an absolute path. The `action` can be one of these: + +**add** + Add the given `filename` to the trace. +**open** + Open the file with the given `filename`. The `filename` has to have + been added with the **add** action before. +**close** + Close the file with the given `filename`. The file has to have been + opened before. + + +The file I/O action format:: + + filename action offset length + +The `filename` is given as an absolute path, and has to have been added and +opened before it can be used with this format. The `offset` and `length` are +given in bytes. 
The `action` can be one of these: + +**wait** + Wait for `offset` microseconds. Everything below 100 is discarded. + The time is relative to the previous `wait` statement. +**read** + Read `length` bytes beginning from `offset`. +**write** + Write `length` bytes beginning from `offset`. +**sync** + :manpage:`fsync(2)` the file. +**datasync** + :manpage:`fdatasync(2)` the file. +**trim** + Trim the given file from the given `offset` for `length` bytes. + +CPU idleness profiling +---------------------- + +In some cases, we want to understand CPU overhead in a test. For example, we +test patches for the specific goodness of whether they reduce CPU usage. +Fio implements a balloon approach to create a thread per CPU that runs at idle +priority, meaning that it only runs when nobody else needs the cpu. +By measuring the amount of work completed by the thread, idleness of each CPU +can be derived accordingly. + +A unit of work is defined as touching a full page of unsigned characters. Mean and +standard deviation of time to complete a unit of work are reported in the "unit work" +section. Options can be chosen to report detailed percpu idleness or overall +system idleness by aggregating percpu stats. + + +Verification and triggers +------------------------- -add Add the given filename to the trace -open Open the file with the given filename. The filename has to have - been added with the add action before. -close Close the file with the given filename. The file has to have been - opened before. - - -The file io action format: - -filename action offset length - -The filename is given as an absolute path, and has to have been added and opened -before it can be used with this format. The offset and length are given in -bytes. The action can be one of these: - -wait Wait for 'offset' microseconds. Everything below 100 is discarded. - The time is relative to the previous wait statement. 
-read Read 'length' bytes beginning from 'offset' -write Write 'length' bytes beginning from 'offset' -sync fsync() the file -datasync fdatasync() the file -trim trim the given file from the given 'offset' for 'length' bytes - - -9.0 CPU idleness profiling --------------------------- -In some cases, we want to understand CPU overhead in a test. For example, -we test patches for the specific goodness of whether they reduce CPU usage. -fio implements a balloon approach to create a thread per CPU that runs at -idle priority, meaning that it only runs when nobody else needs the cpu. -By measuring the amount of work completed by the thread, idleness of each -CPU can be derived accordingly. - -An unit work is defined as touching a full page of unsigned characters. Mean -and standard deviation of time to complete an unit work is reported in "unit -work" section. Options can be chosen to report detailed percpu idleness or -overall system idleness by aggregating percpu stats. - - -10.0 Verification and triggers ------------------------------- -Fio is usually run in one of two ways, when data verification is done. The -first is a normal write job of some sort with verify enabled. When the -write phase has completed, fio switches to reads and verifies everything -it wrote. The second model is running just the write phase, and then later -on running the same job (but with reads instead of writes) to repeat the -same IO patterns and verify the contents. Both of these methods depend -on the write phase being completed, as fio otherwise has no idea how much -data was written. - -With verification triggers, fio supports dumping the current write state -to local files. Then a subsequent read verify workload can load this state -and know exactly where to stop. This is useful for testing cases where -power is cut to a server in a managed fashion, for instance. +Fio is usually run in one of two ways, when data verification is done. 
The first +is a normal write job of some sort with verify enabled. When the write phase has +completed, fio switches to reads and verifies everything it wrote. The second +model is running just the write phase, and then later on running the same job +(but with reads instead of writes) to repeat the same I/O patterns and verify +the contents. Both of these methods depend on the write phase being completed, +as fio otherwise has no idea how much data was written. + +With verification triggers, fio supports dumping the current write state to +local files. Then a subsequent read verify workload can load this state and know +exactly where to stop. This is useful for testing cases where power is cut to a +server in a managed fashion, for instance. A verification trigger consists of two things: -1) Storing the write state of each job -2) Executing a trigger command +1) Storing the write state of each job. +2) Executing a trigger command. -The write state is relatively small, on the order of hundreds of bytes -to single kilobytes. It contains information on the number of completions -done, the last X completions, etc. - -A trigger is invoked either through creation ('touch') of a specified -file in the system, or through a timeout setting. If fio is run with ---trigger-file=/tmp/trigger-file, then it will continually check for -the existence of /tmp/trigger-file. When it sees this file, it will -fire off the trigger (thus saving state, and executing the trigger +The write state is relatively small, on the order of hundreds of bytes to single +kilobytes. It contains information on the number of completions done, the last X +completions, etc. + +A trigger is invoked either through creation ('touch') of a specified file in +the system, or through a timeout setting. If fio is run with +:option:`--trigger-file`\= :file:`/tmp/trigger-file`, then it will continually +check for the existence of :file:`/tmp/trigger-file`. 
When it sees this file, it +will fire off the trigger (thus saving state, and executing the trigger command). -For client/server runs, there's both a local and remote trigger. If -fio is running as a server backend, it will send the job states back -to the client for safe storage, then execute the remote trigger, if -specified. If a local trigger is specified, the server will still send -back the write state, but the client will then execute the trigger. +For client/server runs, there's both a local and remote trigger. If fio is +running as a server backend, it will send the job states back to the client for +safe storage, then execute the remote trigger, if specified. If a local trigger +is specified, the server will still send back the write state, but the client +will then execute the trigger. -10.1 Verification trigger example ---------------------------------- -Lets say we want to run a powercut test on the remote machine 'server'. -Our write workload is in write-test.fio. We want to cut power to 'server' -at some point during the run, and we'll run this test from the safety -or our local machine, 'localbox'. On the server, we'll start the fio -backend normally: +Verification trigger example +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -server# fio --server +Let's say we want to run a powercut test on the remote Linux machine 'server'. +Our write workload is in :file:`write-test.fio`. We want to cut power to 'server' at +some point during the run, and we'll run this test from the safety of our local +machine, 'localbox'. 
On the server, we'll start the fio backend normally:: -and on the client, we'll fire off the workload: + server# fio --server -localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-triger\"" +and on the client, we'll fire off the workload:: -We set /tmp/my-trigger as the trigger file, and we tell fio to execute + localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-trigger\"" -echo b > /proc/sysrq-trigger +We set :file:`/tmp/my-trigger` as the trigger file, and we tell fio to execute:: -on the server once it has received the trigger and sent us the write -state. This will work, but it's not _really_ cutting power to the server, -it's merely abruptly rebooting it. If we have a remote way of cutting -power to the server through IPMI or similar, we could do that through -a local trigger command instead. Lets assume we have a script that does -IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could -then have run fio with a local trigger instead: + echo b > /proc/sysrq-trigger -localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server" +on the server once it has received the trigger and sent us the write state. This +will work, but it's not **really** cutting power to the server, it's merely +abruptly rebooting it. If we have a remote way of cutting power to the server +through IPMI or similar, we could do that through a local trigger command +instead. Let's assume we have a script that does IPMI reboot of a given hostname, +ipmi-reboot. On localbox, we could then have run fio with a local trigger +instead:: -For this case, fio would wait for the server to send us the write state, -then execute 'ipmi-reboot server' when that happened. 
+ localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server" -10.2 Loading verify state -------------------------- -To load store write state, read verification job file must contain -the verify_state_load option. If that is set, fio will load the previously +For this case, fio would wait for the server to send us the write state, then +execute ``ipmi-reboot server`` when that happened. + +Loading verify state +~~~~~~~~~~~~~~~~~~~~ + +To load stored write state, a read verification job file must contain the +:option:`verify_state_load` option. If that is set, fio will load the previously stored state. For a local fio run this is done by loading the files directly, -and on a client/server run, the server backend will ask the client to send -the files over and load them from there. +and on a client/server run, the server backend will ask the client to send the +files over and load them from there. -11.0 Log File Formats ---------------------- +Log File Formats +---------------- Fio supports a variety of log file formats, for logging latencies, bandwidth, and IOPS. The logs share a common format, which looks like this: -time (msec), value, data direction, offset + *time* (`msec`), *value*, *data direction*, *block size* (`bytes`), + *offset* (`bytes`) -Time for the log entry is always in milliseconds. The value logged depends +*Time* for the log entry is always in milliseconds. The *value* logged depends on the type of log, it will be one of the following: - Latency log Value is latency in usecs - Bandwidth log Value is in KB/sec - IOPS log Value is IOPS - -Data direction is one of the following: - - 0 IO is a READ - 1 IO is a WRITE - 2 IO is a TRIM - -The offset is the offset, in bytes, from the start of the file, for that -particular IO. The logging of the offset can be toggled with 'log_offset'. - -If windowed logging is enabled through 'log_avg_msec', then fio doesn't log -individual IOs. 
Instead of logs the average values over the specified -period of time. Since 'data direction' and 'offset' are per-IO values, -they aren't applicable if windowed logging is enabled. If windowed logging -is enabled and 'log_max_value' is set, then fio logs maximum values in -that window instead of averages. + **Latency log** + Value is latency in nsecs + **Bandwidth log** + Value is in KiB/sec + **IOPS log** + Value is IOPS + +*Data direction* is one of the following: + + **0** + I/O is a READ + **1** + I/O is a WRITE + **2** + I/O is a TRIM + +The entry's *block size* is always in bytes. The *offset* is the offset, in bytes, +from the start of the file, for that particular I/O. The logging of the offset can be +toggled with :option:`log_offset`. + +Fio defaults to logging every individual I/O. When IOPS are logged for individual +I/Os the *value* entry will always be 1. If windowed logging is enabled through +:option:`log_avg_msec`, fio logs the average values over the specified period of time. +If windowed logging is enabled and :option:`log_max_value` is set, then fio logs +maximum values in that window instead of averages. Since *data direction*, *block +size* and *offset* are per-I/O values, if windowed logging is enabled they +aren't applicable and will be 0. + +Client/Server +------------- + +Normally fio is invoked as a stand-alone application on the machine where the +I/O workload should be generated. However, the backend and frontend of fio can +be run separately i.e., the fio server can generate an I/O workload on the "Device +Under Test" while being controlled by a client on another machine. + +Start the server on the machine which has access to the storage DUT:: + + $ fio --server=args + +where `args` defines what fio listens to. The arguments are of the form +``type,hostname`` or ``IP,port``. *type* is either ``ip`` (or ip4) for TCP/IP +v4, ``ip6`` for TCP/IP v6, or ``sock`` for a local unix domain socket. 
+*hostname* is either a hostname or IP address, and *port* is the port to listen +to (only valid for TCP/IP, not a local socket). Some examples: + +1) ``fio --server`` + + Start a fio server, listening on all interfaces on the default port (8765). + +2) ``fio --server=ip:hostname,4444`` + + Start a fio server, listening on IP belonging to hostname and on port 4444. + +3) ``fio --server=ip6:::1,4444`` + + Start a fio server, listening on IPv6 localhost ::1 and on port 4444. + +4) ``fio --server=,4444`` + + Start a fio server, listening on all interfaces on port 4444. + +5) ``fio --server=1.2.3.4`` + + Start a fio server, listening on IP 1.2.3.4 on the default port. + +6) ``fio --server=sock:/tmp/fio.sock`` + + Start a fio server, listening on the local socket :file:`/tmp/fio.sock`. + +Once a server is running, a "client" can connect to the fio server with:: + + fio <local-args> --client=<server> <remote-args> <job file(s)> + +where `local-args` are arguments for the client where it is running, `server` +is the connect string, and `remote-args` and `job file(s)` are sent to the +server. The `server` string follows the same format as it does on the server +side, to allow IP/hostname/socket and port strings. + +Fio can connect to multiple servers this way:: + + fio --client=<server1> <job file(s)> --client=<server2> <job file(s)> + +If the job file is located on the fio server, then you can tell the server to +load a local file as well. This is done by using :option:`--remote-config` :: + + fio --client=server --remote-config /path/to/file.fio + +Then fio will open this local (to the server) job file instead of being passed +one from the client. + +If you have many servers (example: 100 VMs/containers), you can input a pathname +of a file containing host IPs/names as the parameter value for the +:option:`--client` option. 
For example, here is an example :file:`host.list` +file containing 2 hostnames:: + + host1.your.dns.domain + host2.your.dns.domain + +The fio command would then be:: + + fio --client=host.list + +In this mode, you cannot input server-specific parameters or job files -- all +servers receive the same job file. + +In order to let ``fio --client`` runs use a shared filesystem from multiple +hosts, ``fio --client`` now prepends the IP address of the server to the +filename. For example, if fio is using the directory :file:`/mnt/nfs/fio` and is +writing filename :file:`fileio.tmp`, with a :option:`--client` `hostfile` +containing two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and +192.168.10.121, then fio will create two files:: + + /mnt/nfs/fio/192.168.10.120.fileio.tmp + /mnt/nfs/fio/192.168.10.121.fileio.tmp diff -Nru fio-2.16/idletime.c fio-3.1/idletime.c --- fio-2.16/idletime.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/idletime.c 2017-09-28 10:23:20.000000000 +0000 @@ -11,7 +11,7 @@ static double calibrate_unit(unsigned char *data) { unsigned long t, i, j, k; - struct timeval tps; + struct timespec tps; double tunit = 0.0; for (i = 0; i < CALIBRATE_RUNS; i++) { @@ -183,7 +183,6 @@ void fio_idle_prof_init(void) { int i, ret; - struct timeval tp; struct timespec ts; pthread_attr_t tattr; struct idle_prof_thread *ipt; @@ -282,9 +281,8 @@ pthread_mutex_lock(&ipt->init_lock); while ((ipt->state != TD_EXITED) && (ipt->state!=TD_INITIALIZED)) { - fio_gettime(&tp, NULL); - ts.tv_sec = tp.tv_sec + 1; - ts.tv_nsec = tp.tv_usec * 1000; + fio_gettime(&ts, NULL); + ts.tv_sec += 1; pthread_cond_timedwait(&ipt->cond, &ipt->init_lock, &ts); } pthread_mutex_unlock(&ipt->init_lock); @@ -325,7 +323,6 @@ { int i; uint64_t runt; - struct timeval tp; struct timespec ts; struct idle_prof_thread *ipt; @@ -343,9 +340,8 @@ pthread_mutex_lock(&ipt->start_lock); while ((ipt->state != TD_EXITED) && (ipt->state!=TD_NOT_CREATED)) { - fio_gettime(&tp, NULL); - ts.tv_sec = 
tp.tv_sec + 1; - ts.tv_nsec = tp.tv_usec * 1000; + fio_gettime(&ts, NULL); + ts.tv_sec += 1; /* timed wait in case a signal is not received */ pthread_cond_timedwait(&ipt->cond, &ipt->start_lock, &ts); } diff -Nru fio-2.16/idletime.h fio-3.1/idletime.h --- fio-2.16/idletime.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/idletime.h 2017-09-28 10:23:20.000000000 +0000 @@ -26,8 +26,8 @@ pthread_t thread; int cpu; int state; - struct timeval tps; - struct timeval tpe; + struct timespec tps; + struct timespec tpe; double cali_time; /* microseconds to finish a unit work */ double loops; double idleness; diff -Nru fio-2.16/init.c fio-3.1/init.c --- fio-2.16/init.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/init.c 2017-09-28 10:23:20.000000000 +0000 @@ -31,6 +31,7 @@ #include "oslib/strcasestr.h" #include "crc/test.h" +#include "lib/pow2.h" const char fio_version_string[] = FIO_VERSION; @@ -39,7 +40,6 @@ static char **ini_file; static int max_jobs = FIO_MAX_JOBS; static int dump_cmdline; -static long long def_timeout; static int parse_only; static struct thread_data def_thread; @@ -93,11 +93,6 @@ .val = 'o' | FIO_CLIENT_FLAG, }, { - .name = (char *) "timeout", - .has_arg = required_argument, - .val = 't' | FIO_CLIENT_FLAG, - }, - { .name = (char *) "latency-log", .has_arg = required_argument, .val = 'l' | FIO_CLIENT_FLAG, @@ -361,10 +356,12 @@ perror("shmat"); return 1; } + if (shm_attach_to_open_removed()) + shmctl(shm_id, IPC_RMID, NULL); #endif memset(threads, 0, max_jobs * sizeof(struct thread_data)); - fio_debug_jobp = (void *) threads + max_jobs * sizeof(struct thread_data); + fio_debug_jobp = (unsigned int *)(threads + max_jobs); *fio_debug_jobp = -1; flow_init(); @@ -372,14 +369,6 @@ return 0; } -static void set_cmd_options(struct thread_data *td) -{ - struct thread_options *o = &td->o; - - if (!o->timeout) - o->timeout = def_timeout; -} - static void dump_print_option(struct print_option *p) { const char *delim; @@ -445,15 +434,13 @@ /* * Return a free job 
structure. */ -static struct thread_data *get_new_job(int global, struct thread_data *parent, - int preserve_eo, const char *jobname) +static struct thread_data *get_new_job(bool global, struct thread_data *parent, + bool preserve_eo, const char *jobname) { struct thread_data *td; - if (global) { - set_cmd_options(&def_thread); + if (global) return &def_thread; - } if (setup_thread_area()) { log_err("error: failed to setup shm segment\n"); return NULL; @@ -472,6 +459,7 @@ copy_opt_list(td, parent); td->io_ops = NULL; + td->io_ops_init = 0; if (!preserve_eo) td->eo = NULL; @@ -491,7 +479,6 @@ if (!parent->o.group_reporting || parent == &def_thread) stat_number++; - set_cmd_options(td); return td; } @@ -536,7 +523,7 @@ td->rate_next_io_time[ddir] = 0; td->rate_io_issue_bytes[ddir] = 0; - td->last_usec = 0; + td->last_usec[ddir] = 0; return 0; } @@ -581,6 +568,17 @@ } /* + * <3 Johannes + */ +static unsigned int gcd(unsigned int m, unsigned int n) +{ + if (!n) + return m; + + return gcd(n, m % n); +} + +/* * Lazy way of fixing up options that depend on each other. We could also * define option callback handlers, but this is easier. */ @@ -589,7 +587,7 @@ struct thread_options *o = &td->o; int ret = 0; -#ifndef FIO_HAVE_PSHARED_MUTEX +#ifndef CONFIG_PSHARED if (!o->use_thread) { log_info("fio: this platform does not support process shared" " mutexes, forcing use of threads. 
Use the 'thread'" @@ -622,7 +620,7 @@ /* * Reads can do overwrites, we always need to pre-create the file */ - if (td_read(td) || td_rw(td)) + if (td_read(td)) o->overwrite = 1; if (!o->min_bs[DDIR_READ]) @@ -700,6 +698,23 @@ if (o->iodepth_batch_complete_min > o->iodepth_batch_complete_max) o->iodepth_batch_complete_max = o->iodepth_batch_complete_min; + /* + * There's no need to check for in-flight overlapping IOs if the job + * isn't changing data or the maximum iodepth is guaranteed to be 1 + */ + if (o->serialize_overlap && !(td->flags & TD_F_READ_IOLOG) && + (!(td_write(td) || td_trim(td)) || o->iodepth == 1)) + o->serialize_overlap = 0; + /* + * Currently can't check for overlaps in offload mode + */ + if (o->serialize_overlap && o->io_submit_mode == IO_MODE_OFFLOAD) { + log_err("fio: checking for in-flight overlaps when the " + "io_submit_mode is offload is not supported\n"); + o->serialize_overlap = 0; + ret = warnings_fatal; + } + if (o->nr_files > td->files_index) o->nr_files = td->files_index; @@ -733,13 +748,30 @@ o->size = -1ULL; if (o->verify != VERIFY_NONE) { - if (td_write(td) && o->do_verify && o->numjobs > 1) { - log_info("Multiple writers may overwrite blocks that " - "belong to other jobs. This can cause " + if (td_write(td) && o->do_verify && o->numjobs > 1 && + (o->filename || + !(o->unique_filename && + strstr(o->filename_format, "$jobname") && + strstr(o->filename_format, "$jobnum") && + strstr(o->filename_format, "$filenum")))) { + log_info("fio: multiple writers may overwrite blocks " + "that belong to other jobs. 
This can cause " "verification failures.\n"); ret = warnings_fatal; } + /* + * Warn if verification is requested but no verification of any + * kind can be started due to time constraints + */ + if (td_write(td) && o->do_verify && o->timeout && + o->time_based && !td_read(td) && !o->verify_backlog) { + log_info("fio: verification read phase will never " + "start because write phase uses all of " + "runtime\n"); + ret = warnings_fatal; + } + if (!fio_option_is_set(o, refill_buffers)) o->refill_buffers = 1; @@ -755,10 +787,20 @@ o->verify_interval = o->min_bs[DDIR_WRITE]; else if (td_read(td) && o->verify_interval > o->min_bs[DDIR_READ]) o->verify_interval = o->min_bs[DDIR_READ]; + + /* + * Verify interval must be a factor or both min and max + * write size + */ + if (o->verify_interval % o->min_bs[DDIR_WRITE] || + o->verify_interval % o->max_bs[DDIR_WRITE]) + o->verify_interval = gcd(o->min_bs[DDIR_WRITE], + o->max_bs[DDIR_WRITE]); } if (o->pre_read) { - o->invalidate_cache = 0; + if (o->invalidate_cache) + o->invalidate_cache = 0; if (td_ioengine_flagged(td, FIO_PIPEIO)) { log_info("fio: cannot pre-read files with an IO engine" " that isn't seekable. Pre-read disabled.\n"); @@ -773,6 +815,11 @@ o->unit_base = 8; } +#ifndef FIO_HAVE_ANY_FALLOCATE + /* Platform doesn't support any fallocate so force it to none */ + o->fallocate_mode = FIO_FALLOCATE_NONE; +#endif + #ifndef CONFIG_FDATASYNC if (o->fdatasync_blocks) { log_info("fio: this platform does not support fdatasync()" @@ -790,7 +837,7 @@ * Windows doesn't support O_DIRECT or O_SYNC with the _open interface, * so fail if we're passed those flags */ - if (td_ioengine_flagged(td, FIO_SYNCIO) && (td->o.odirect || td->o.sync_io)) { + if (td_ioengine_flagged(td, FIO_SYNCIO) && (o->odirect || o->sync_io)) { log_err("fio: Windows does not support direct or non-buffered io with" " the synchronous ioengines. 
Use the 'windowsaio' ioengine" " with 'direct=1' and 'iodepth=1' instead.\n"); @@ -816,8 +863,8 @@ * Using a non-uniform random distribution excludes usage of * a random map */ - if (td->o.random_distribution != FIO_RAND_DIST_RANDOM) - td->o.norandommap = 1; + if (o->random_distribution != FIO_RAND_DIST_RANDOM) + o->norandommap = 1; /* * If size is set but less than the min block size, complain @@ -831,16 +878,16 @@ /* * O_ATOMIC implies O_DIRECT */ - if (td->o.oatomic) - td->o.odirect = 1; + if (o->oatomic) + o->odirect = 1; /* * If randseed is set, that overrides randrepeat */ - if (fio_option_is_set(&td->o, rand_seed)) - td->o.rand_repeatable = 0; + if (fio_option_is_set(o, rand_seed)) + o->rand_repeatable = 0; - if (td_ioengine_flagged(td, FIO_NOEXTEND) && td->o.file_append) { + if (td_ioengine_flagged(td, FIO_NOEXTEND) && o->file_append) { log_err("fio: can't append/extent with IO engine %s\n", td->io_ops->name); ret = 1; } @@ -855,49 +902,28 @@ if (!td->loops) td->loops = 1; - if (td->o.block_error_hist && td->o.nr_files != 1) { + if (o->block_error_hist && o->nr_files != 1) { log_err("fio: block error histogram only available " "with a single file per job, but %d files " - "provided\n", td->o.nr_files); + "provided\n", o->nr_files); ret = 1; } - return ret; -} - -/* - * This function leaks the buffer - */ -char *fio_uint_to_kmg(unsigned int val) -{ - char *buf = malloc(32); - char post[] = { 0, 'K', 'M', 'G', 'P', 'E', 0 }; - char *p = post; - - do { - if (val & 1023) - break; - - val >>= 10; - p++; - } while (*p); - - snprintf(buf, 32, "%u%c", val, *p); - return buf; -} - -/* External engines are specified by "external:name.o") */ -static const char *get_engine_name(const char *str) -{ - char *p = strstr(str, ":"); - - if (!p) - return str; + if (fio_option_is_set(o, clat_percentiles) && + !fio_option_is_set(o, lat_percentiles)) { + o->lat_percentiles = !o->clat_percentiles; + } else if (fio_option_is_set(o, lat_percentiles) && + !fio_option_is_set(o, 
clat_percentiles)) { + o->clat_percentiles = !o->lat_percentiles; + } else if (fio_option_is_set(o, lat_percentiles) && + fio_option_is_set(o, clat_percentiles) && + o->lat_percentiles && o->clat_percentiles) { + log_err("fio: lat_percentiles and clat_percentiles are " + "mutually exclusive\n"); + ret = 1; + } - p++; - strip_blank_front(&p); - strip_blank_end(p); - return p; + return ret; } static void init_rand_file_service(struct thread_data *td) @@ -922,9 +948,9 @@ bool use64; if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) - use64 = 1; + use64 = true; else - use64 = 0; + use64 = false; init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF], use64); @@ -934,7 +960,22 @@ { int i; - init_rand_seed(&td->bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF], use64); + /* + * trimwrite is special in that we need to generate the same + * offsets to get the "write after trim" effect. If we are + * using bssplit to set buffer length distributions, ensure that + * we seed the trim and write generators identically. 
+ */ + if (td_trimwrite(td)) { + init_rand_seed(&td->bsrange_state[DDIR_READ], td->rand_seeds[FIO_RAND_BS_OFF], use64); + init_rand_seed(&td->bsrange_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_BS1_OFF], use64); + init_rand_seed(&td->bsrange_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_BS1_OFF], use64); + } else { + init_rand_seed(&td->bsrange_state[DDIR_READ], td->rand_seeds[FIO_RAND_BS_OFF], use64); + init_rand_seed(&td->bsrange_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_BS1_OFF], use64); + init_rand_seed(&td->bsrange_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_BS2_OFF], use64); + } + td_fill_verify_state_seed(td); init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false); @@ -946,7 +987,9 @@ init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], use64); init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF], use64); init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY], use64); - init_rand_seed(&td->poisson_state, td->rand_seeds[FIO_RAND_POISSON_OFF], 0); + init_rand_seed(&td->poisson_state[0], td->rand_seeds[FIO_RAND_POISSON_OFF], 0); + init_rand_seed(&td->poisson_state[1], td->rand_seeds[FIO_RAND_POISSON2_OFF], 0); + init_rand_seed(&td->poisson_state[2], td->rand_seeds[FIO_RAND_POISSON3_OFF], 0); init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false); init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false); @@ -978,9 +1021,9 @@ } if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) - use64 = 1; + use64 = true; else - use64 = 0; + use64 = false; td_fill_rand_seeds_internal(td, use64); @@ -994,22 +1037,27 @@ */ int ioengine_load(struct thread_data *td) { - const char *engine; - - /* - * Engine has already been loaded. 
- */ - if (td->io_ops) - return 0; if (!td->o.ioengine) { log_err("fio: internal fault, no IO engine specified\n"); return 1; } - engine = get_engine_name(td->o.ioengine); - td->io_ops = load_ioengine(td, engine); + if (td->io_ops) { + /* An engine is loaded, but the requested ioengine + * may have changed. + */ + if (!strcmp(td->io_ops->name, td->o.ioengine)) { + /* The right engine is already loaded */ + return 0; + } + + /* Unload the old engine. */ + free_ioengine(td); + } + + td->io_ops = load_ioengine(td); if (!td->io_ops) { - log_err("fio: failed to load engine %s\n", engine); + log_err("fio: failed to load engine\n"); return 1; } @@ -1037,7 +1085,7 @@ */ if (origeo) { memcpy(td->eo, origeo, td->io_ops->option_struct_size); - options_mem_dupe(td->eo, td->io_ops->options); + options_mem_dupe(td->io_ops->options, td->eo); } else { memset(td->eo, 0, td->io_ops->option_struct_size); fill_default_options(td->eo, td->io_ops->options); @@ -1081,6 +1129,9 @@ if (o->verify_async || o->io_submit_mode == IO_MODE_OFFLOAD) td->flags |= TD_F_NEED_LOCK; + + if (o->mem_type == MEM_CUDA_MALLOC) + td->flags &= ~TD_F_SCRAMBLE_BUFFERS; } static int setup_random_seeds(struct thread_data *td) @@ -1088,8 +1139,12 @@ unsigned long seed; unsigned int i; - if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) - return init_random_state(td, td->rand_seeds, sizeof(td->rand_seeds)); + if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) { + int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds)); + if (!ret) + td_fill_rand_seeds(td); + return ret; + } seed = td->o.rand_seed; for (i = 0; i < 4; i++) @@ -1131,7 +1186,7 @@ if (!o->filename_format || !strlen(o->filename_format)) { sprintf(buf, "%s.%d.%d", jobname, jobnum, filenum); - return NULL; + return buf; } for (f = &fpre_keywords[0]; f->keyword; f++) @@ -1360,6 +1415,7 @@ td->mutex = fio_mutex_init(FIO_MUTEX_LOCKED); td->ts.clat_percentiles = o->clat_percentiles; + td->ts.lat_percentiles = 
o->lat_percentiles; td->ts.percentile_precision = o->percentile_precision; memcpy(td->ts.percentile_list, o->percentile_list, sizeof(o->percentile_list)); @@ -1368,6 +1424,7 @@ td->ts.slat_stat[i].min_val = ULONG_MAX; td->ts.lat_stat[i].min_val = ULONG_MAX; td->ts.bw_stat[i].min_val = ULONG_MAX; + td->ts.iops_stat[i].min_val = ULONG_MAX; } td->ddir_seq_nr = o->ddir_seq_nr; @@ -1384,7 +1441,7 @@ prev_group_jobs++; if (setup_random_seeds(td)) { - td_verror(td, errno, "init_random_state"); + td_verror(td, errno, "setup_random_seeds"); goto err; } @@ -1528,15 +1585,16 @@ if (!td_ioengine_flagged(td, FIO_NOIO)) { char *c1, *c2, *c3, *c4; char *c5 = NULL, *c6 = NULL; + int i2p = is_power_of_2(o->kb_base); - c1 = fio_uint_to_kmg(o->min_bs[DDIR_READ]); - c2 = fio_uint_to_kmg(o->max_bs[DDIR_READ]); - c3 = fio_uint_to_kmg(o->min_bs[DDIR_WRITE]); - c4 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]); + c1 = num2str(o->min_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE); + c2 = num2str(o->max_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE); + c3 = num2str(o->min_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE); + c4 = num2str(o->max_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE); if (!o->bs_is_seq_rand) { - c5 = fio_uint_to_kmg(o->min_bs[DDIR_TRIM]); - c6 = fio_uint_to_kmg(o->max_bs[DDIR_TRIM]); + c5 = num2str(o->min_bs[DDIR_TRIM], 4, 1, i2p, N2S_BYTE); + c6 = num2str(o->max_bs[DDIR_TRIM], 4, 1, i2p, N2S_BYTE); } log_info("%s: (g=%d): rw=%s, ", td->o.name, @@ -1544,10 +1602,10 @@ ddir_str(o->td_ddir)); if (o->bs_is_seq_rand) - log_info("bs(seq/rand)=%s-%s/%s-%s, ", + log_info("bs=(R) %s-%s, (W) %s-%s, bs_is_seq_rand, ", c1, c2, c3, c4); else - log_info("bs=%s-%s/%s-%s/%s-%s, ", + log_info("bs=(R) %s-%s, (W) %s-%s, (T) %s-%s, ", c1, c2, c3, c4, c5, c6); log_info("ioengine=%s, iodepth=%u\n", @@ -1573,7 +1631,7 @@ */ numjobs = o->numjobs; while (--numjobs) { - struct thread_data *td_new = get_new_job(0, td, 1, jobname); + struct thread_data *td_new = get_new_job(false, td, true, jobname); if (!td_new) goto err; @@ -1634,11 +1692,11 
@@ sprintf(jobname, "%s", o[i] + 5); } if (in_global && !td_parent) - td_parent = get_new_job(1, &def_thread, 0, jobname); + td_parent = get_new_job(true, &def_thread, false, jobname); else if (!in_global && !td) { if (!td_parent) td_parent = &def_thread; - td = get_new_job(0, td_parent, 0, jobname); + td = get_new_job(false, td_parent, false, jobname); } if (in_global) fio_options_parse(td_parent, (char **) &o[i], 1); @@ -1690,7 +1748,7 @@ char *file, int is_buf, int stonewall_flag, int type, int nested, char *name, char ***popts, int *aopts, int *nopts) { - unsigned int global = 0; + bool global = false; char *string; FILE *f; char *p; @@ -1799,7 +1857,7 @@ first_sect = 0; } - td = get_new_job(global, &def_thread, 0, name); + td = get_new_job(global, &def_thread, false, name); if (!td) { ret = 1; break; @@ -2003,6 +2061,11 @@ #endif } +/* + * Following options aren't printed by usage(). + * --append-terse - Equivalent to --output-format=terse, see f6a7df53. + * --latency-log - Deprecated option. 
+ */ static void usage(const char *name) { printf("%s\n", fio_version_string); @@ -2011,15 +2074,15 @@ show_debug_categories(); printf(" --parse-only\t\tParse options only, don't start any IO\n"); printf(" --output\t\tWrite output to file\n"); - printf(" --runtime\t\tRuntime in seconds\n"); printf(" --bandwidth-log\tGenerate aggregate bandwidth logs\n"); printf(" --minimal\t\tMinimal (terse) output\n"); - printf(" --output-format=x\tOutput format (terse,json,json+,normal)\n"); - printf(" --terse-version=x\tSet terse version output format to 'x'\n"); + printf(" --output-format=type\tOutput format (terse,json,json+,normal)\n"); + printf(" --terse-version=type\tSet terse version output format" + " (default 3, or 2 or 4)\n"); printf(" --version\t\tPrint version info and exit\n"); printf(" --help\t\tPrint this page\n"); printf(" --cpuclock-test\tPerform test/validation of CPU clock\n"); - printf(" --crctest\t\tTest speed of checksum functions\n"); + printf(" --crctest=[type]\tTest speed of checksum functions\n"); printf(" --cmdhelp=cmd\t\tPrint command help, \"all\" for all of" " them\n"); printf(" --enghelp=engine\tPrint ioengine help, or list" @@ -2035,14 +2098,15 @@ printf(" 't' period passed\n"); printf(" --readonly\t\tTurn on safety read-only checks, preventing" " writes\n"); - printf(" --section=name\tOnly run specified section in job file\n"); + printf(" --section=name\tOnly run specified section in job file," + " multiple sections can be specified\n"); printf(" --alloc-size=kb\tSet smalloc pool to this size in kb" - " (def 1024)\n"); + " (def 16384)\n"); printf(" --warnings-fatal\tFio parser warnings are fatal\n"); printf(" --max-jobs=nr\t\tMaximum number of threads/processes to support\n"); printf(" --server=args\t\tStart a backend fio server\n"); printf(" --daemonize=pidfile\tBackground fio server, write pid to file\n"); - printf(" --client=hostname\tTalk to remote backend fio server at hostname\n"); + printf(" --client=hostname\tTalk to remote backend(s) fio 
server at hostname\n"); printf(" --remote-config=file\tTell fio server to load this local job file\n"); printf(" --idle-prof=option\tReport cpu idleness on a system or percpu basis\n" "\t\t\t(option=system,percpu) or run unit work\n" @@ -2051,7 +2115,7 @@ printf(" --inflate-log=log\tInflate and output compressed log\n"); #endif printf(" --trigger-file=file\tExecute trigger cmd when file exists\n"); - printf(" --trigger-timeout=t\tExecute trigger af this time\n"); + printf(" --trigger-timeout=t\tExecute trigger at this time\n"); printf(" --trigger=cmd\t\tSet this command as local trigger\n"); printf(" --trigger-remote=cmd\tSet this command as remote trigger\n"); printf(" --aux-path=path\tUse this path for fio state generated files\n"); @@ -2325,13 +2389,6 @@ smalloc_pool_size <<= 10; sinit(); break; - case 't': - if (check_str_time(optarg, &def_timeout, 1)) { - log_err("fio: failed parsing time %s\n", optarg); - do_exit++; - exit_val = 1; - } - break; case 'l': log_err("fio: --latency-log is deprecated. 
Use per-job latency log options.\n"); do_exit++; @@ -2340,17 +2397,22 @@ case 'b': write_bw_log = 1; break; - case 'o': + case 'o': { + FILE *tmp; + if (f_out && f_out != stdout) fclose(f_out); - f_out = fopen(optarg, "w+"); - if (!f_out) { - perror("fopen output"); - exit(1); + tmp = fopen(optarg, "w+"); + if (!tmp) { + log_err("fio: output file open error: %s\n", strerror(errno)); + exit_val = 1; + do_exit++; + break; } - f_err = f_out; + f_err = f_out = tmp; break; + } case 'm': output_format = FIO_OUTPUT_TERSE; break; @@ -2402,8 +2464,7 @@ break; case 'V': terse_version = atoi(optarg); - if (!(terse_version == 2 || terse_version == 3 || - terse_version == 4)) { + if (!(terse_version >= 2 && terse_version <= 5)) { log_err("fio: bad terse version format\n"); exit_val = 1; do_exit++; @@ -2484,7 +2545,7 @@ if (is_section && skip_this_section(val)) continue; - td = get_new_job(global, &def_thread, 1, NULL); + td = get_new_job(global, &def_thread, true, NULL); if (!td || ioengine_load(td)) { if (td) { put_job(td); @@ -2514,7 +2575,6 @@ } if (!ret && !strcmp(opt, "ioengine")) { - free_ioengine(td); if (ioengine_load(td)) { put_job(td); td = NULL; @@ -2722,7 +2782,7 @@ if (!ret) { ret = add_job(td, td->o.name ?: "fio", 0, 0, client_type); if (ret) - did_arg = 1; + exit(1); } } @@ -2734,9 +2794,6 @@ } out_free: - if (pid_file) - free(pid_file); - return ini_idx; } @@ -2805,7 +2862,7 @@ if (did_arg) return 0; - log_err("No jobs(s) defined\n\n"); + log_err("No job(s) defined\n\n"); if (!did_arg) { usage(argv[0]); diff -Nru fio-2.16/io_ddir.h fio-3.1/io_ddir.h --- fio-2.16/io_ddir.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/io_ddir.h 2017-09-28 10:23:20.000000000 +0000 @@ -61,9 +61,9 @@ static inline const char *ddir_str(enum td_ddir ddir) { - static const char *__str[] = { NULL, "read", "write", "rw", NULL, + static const char *__str[] = { NULL, "read", "write", "rw", "rand", "randread", "randwrite", "randrw", - "trim", NULL, NULL, NULL, "randtrim" }; + "trim", 
NULL, "trimwrite", NULL, "randtrim" }; return __str[ddir]; } diff -Nru fio-2.16/ioengine.h fio-3.1/ioengine.h --- fio-2.16/ioengine.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/ioengine.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,265 +0,0 @@ -#ifndef FIO_IOENGINE_H -#define FIO_IOENGINE_H - -#include "compiler/compiler.h" -#include "os/os.h" -#include "log.h" -#include "io_ddir.h" -#include "debug.h" -#include "file.h" -#include "workqueue.h" - -#ifdef CONFIG_LIBAIO -#include -#endif -#ifdef CONFIG_GUASI -#include -#endif - -#define FIO_IOOPS_VERSION 23 - -enum { - IO_U_F_FREE = 1 << 0, - IO_U_F_FLIGHT = 1 << 1, - IO_U_F_NO_FILE_PUT = 1 << 2, - IO_U_F_IN_CUR_DEPTH = 1 << 3, - IO_U_F_BUSY_OK = 1 << 4, - IO_U_F_TRIMMED = 1 << 5, - IO_U_F_BARRIER = 1 << 6, - IO_U_F_VER_LIST = 1 << 7, -}; - -/* - * The io unit - */ -struct io_u { - struct timeval start_time; - struct timeval issue_time; - - struct fio_file *file; - unsigned int flags; - enum fio_ddir ddir; - - /* - * For replay workloads, we may want to account as a different - * IO type than what is being submitted. - */ - enum fio_ddir acct_ddir; - - /* - * Write generation - */ - unsigned short numberio; - - /* - * Allocated/set buffer and length - */ - unsigned long buflen; - unsigned long long offset; - void *buf; - - /* - * Initial seed for generating the buffer contents - */ - uint64_t rand_seed; - - /* - * IO engine state, may be different from above when we get - * partial transfers / residual data counts - */ - void *xfer_buf; - unsigned long xfer_buflen; - - /* - * Parameter related to pre-filled buffers and - * their size to handle variable block sizes. 
- */ - unsigned long buf_filled_len; - - struct io_piece *ipo; - - unsigned int resid; - unsigned int error; - - /* - * io engine private data - */ - union { - unsigned int index; - unsigned int seen; - void *engine_data; - }; - - union { - struct flist_head verify_list; - struct workqueue_work work; - }; - - /* - * Callback for io completion - */ - int (*end_io)(struct thread_data *, struct io_u **); - - union { -#ifdef CONFIG_LIBAIO - struct iocb iocb; -#endif -#ifdef CONFIG_POSIXAIO - os_aiocb_t aiocb; -#endif -#ifdef FIO_HAVE_SGIO - struct sg_io_hdr hdr; -#endif -#ifdef CONFIG_GUASI - guasi_req_t greq; -#endif -#ifdef CONFIG_SOLARISAIO - aio_result_t resultp; -#endif -#ifdef FIO_HAVE_BINJECT - struct b_user_cmd buc; -#endif -#ifdef CONFIG_RDMA - struct ibv_mr *mr; -#endif - void *mmap_data; - uint64_t null; - }; -}; - -/* - * io_ops->queue() return values - */ -enum { - FIO_Q_COMPLETED = 0, /* completed sync */ - FIO_Q_QUEUED = 1, /* queued, will complete async */ - FIO_Q_BUSY = 2, /* no more room, call ->commit() */ -}; - -struct ioengine_ops { - struct flist_head list; - const char *name; - int version; - int flags; - int (*setup)(struct thread_data *); - int (*init)(struct thread_data *); - int (*prep)(struct thread_data *, struct io_u *); - int (*queue)(struct thread_data *, struct io_u *); - int (*commit)(struct thread_data *); - int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *); - struct io_u *(*event)(struct thread_data *, int); - char *(*errdetails)(struct io_u *); - int (*cancel)(struct thread_data *, struct io_u *); - void (*cleanup)(struct thread_data *); - int (*open_file)(struct thread_data *, struct fio_file *); - int (*close_file)(struct thread_data *, struct fio_file *); - int (*invalidate)(struct thread_data *, struct fio_file *); - int (*unlink_file)(struct thread_data *, struct fio_file *); - int (*get_file_size)(struct thread_data *, struct fio_file *); - void (*terminate)(struct thread_data *); - 
int (*iomem_alloc)(struct thread_data *, size_t); - void (*iomem_free)(struct thread_data *); - int (*io_u_init)(struct thread_data *, struct io_u *); - void (*io_u_free)(struct thread_data *, struct io_u *); - int option_struct_size; - struct fio_option *options; -}; - -enum fio_ioengine_flags { - FIO_SYNCIO = 1 << 0, /* io engine has synchronous ->queue */ - FIO_RAWIO = 1 << 1, /* some sort of direct/raw io */ - FIO_DISKLESSIO = 1 << 2, /* no disk involved */ - FIO_NOEXTEND = 1 << 3, /* engine can't extend file */ - FIO_NODISKUTIL = 1 << 4, /* diskutil can't handle filename */ - FIO_UNIDIR = 1 << 5, /* engine is uni-directional */ - FIO_NOIO = 1 << 6, /* thread does only pseudo IO */ - FIO_PIPEIO = 1 << 7, /* input/output no seekable */ - FIO_BARRIER = 1 << 8, /* engine supports barriers */ - FIO_MEMALIGN = 1 << 9, /* engine wants aligned memory */ - FIO_BIT_BASED = 1 << 10, /* engine uses a bit base (e.g. uses Kbit as opposed to KB) */ - FIO_FAKEIO = 1 << 11, /* engine pretends to do IO */ -}; - -/* - * External engine defined symbol to fill in the engine ops structure - */ -typedef void (*get_ioengine_t)(struct ioengine_ops **); - -/* - * io engine entry points - */ -extern int __must_check td_io_init(struct thread_data *); -extern int __must_check td_io_prep(struct thread_data *, struct io_u *); -extern int __must_check td_io_queue(struct thread_data *, struct io_u *); -extern int __must_check td_io_sync(struct thread_data *, struct fio_file *); -extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *); -extern int __must_check td_io_commit(struct thread_data *); -extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *); -extern int td_io_close_file(struct thread_data *, struct fio_file *); -extern int td_io_unlink_file(struct thread_data *, struct fio_file *); -extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *); - -extern struct ioengine_ops 
*load_ioengine(struct thread_data *, const char *); -extern void register_ioengine(struct ioengine_ops *); -extern void unregister_ioengine(struct ioengine_ops *); -extern void free_ioengine(struct thread_data *); -extern void close_ioengine(struct thread_data *); - -extern int fio_show_ioengine_help(const char *engine); - -/* - * io unit handling - */ -extern struct io_u *__get_io_u(struct thread_data *); -extern struct io_u *get_io_u(struct thread_data *); -extern void put_io_u(struct thread_data *, struct io_u *); -extern void clear_io_u(struct thread_data *, struct io_u *); -extern void requeue_io_u(struct thread_data *, struct io_u **); -extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *); -extern int __must_check io_u_queued_complete(struct thread_data *, int); -extern void io_u_queued(struct thread_data *, struct io_u *); -extern int io_u_quiesce(struct thread_data *); -extern void io_u_log_error(struct thread_data *, struct io_u *); -extern void io_u_mark_depth(struct thread_data *, unsigned int); -extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned int); -extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int); -void io_u_mark_complete(struct thread_data *, unsigned int); -void io_u_mark_submit(struct thread_data *, unsigned int); -bool queue_full(const struct thread_data *); - -int do_io_u_sync(const struct thread_data *, struct io_u *); -int do_io_u_trim(const struct thread_data *, struct io_u *); - -#ifdef FIO_INC_DEBUG -static inline void dprint_io_u(struct io_u *io_u, const char *p) -{ - struct fio_file *f = io_u->file; - - dprint(FD_IO, "%s: io_u %p: off=%llu/len=%lu/ddir=%d", p, io_u, - (unsigned long long) io_u->offset, - io_u->buflen, io_u->ddir); - if (fio_debug & (1 << FD_IO)) { - if (f) - log_info("/%s", f->file_name); - - log_info("\n"); - } -} -#else -#define dprint_io_u(io_u, p) -#endif - -static inline enum fio_ddir acct_ddir(struct io_u *io_u) 
-{ - if (io_u->acct_ddir != -1) - return io_u->acct_ddir; - - return io_u->ddir; -} - -#define io_u_clear(td, io_u, val) \ - td_flags_clear((td), &(io_u->flags), (val)) -#define io_u_set(td, io_u, val) \ - td_flags_set((td), &(io_u)->flags, (val)) - -#endif diff -Nru fio-2.16/ioengines.c fio-3.1/ioengines.c --- fio-2.16/ioengines.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/ioengines.c 2017-09-28 10:23:20.000000000 +0000 @@ -123,13 +123,10 @@ return ops; } -struct ioengine_ops *load_ioengine(struct thread_data *td, const char *name) +static struct ioengine_ops *__load_ioengine(const char *name) { - struct ioengine_ops *ops; char engine[64]; - dprint(FD_IO, "load ioengine %s\n", name); - engine[sizeof(engine) - 1] = '\0'; strncpy(engine, name, sizeof(engine) - 1); @@ -139,10 +136,37 @@ if (!strncmp(engine, "linuxaio", 8) || !strncmp(engine, "aio", 3)) strcpy(engine, "libaio"); - ops = find_ioengine(engine); + dprint(FD_IO, "load ioengine %s\n", engine); + return find_ioengine(engine); +} + +struct ioengine_ops *load_ioengine(struct thread_data *td) +{ + struct ioengine_ops *ops = NULL; + const char *name; + + /* + * Use ->ioengine_so_path if an external ioengine path is specified. + * In this case, ->ioengine is "external" which also means the prefix + * for external ioengines "external:" is properly used. + */ + name = td->o.ioengine_so_path ?: td->o.ioengine; + + /* + * Try to load ->ioengine first, and if failed try to dlopen(3) either + * ->ioengine or ->ioengine_so_path. This is redundant for an external + * ioengine with prefix, and also leaves the possibility of unexpected + * behavior (e.g. if the "external" ioengine exists), but we do this + * so as not to break job files not using the prefix. + */ + ops = __load_ioengine(td->o.ioengine); if (!ops) ops = dlopen_ioengine(td, name); + /* + * If ops is NULL, we failed to load ->ioengine, and also failed to + * dlopen(3) either ->ioengine or ->ioengine_so_path as a path. 
+ */ if (!ops) { log_err("fio: engine %s not loadable\n", name); return NULL; @@ -281,7 +305,7 @@ */ if (td->o.read_iolog_file) memcpy(&td->last_issue, &io_u->issue_time, - sizeof(struct timeval)); + sizeof(io_u->issue_time)); } if (ddir_rw(ddir)) { @@ -318,8 +342,8 @@ td->o.odirect) { log_info("fio: first direct IO errored. File system may not " - "support direct IO, or iomem_align= is bad. Try " - "setting direct=0.\n"); + "support direct IO, or iomem_align= is bad, or " + "invalid block size. Try setting direct=0.\n"); } if (!td->io_ops->commit || io_u->ddir == DDIR_TRIM) { @@ -356,7 +380,7 @@ */ if (td->o.read_iolog_file) memcpy(&td->last_issue, &io_u->issue_time, - sizeof(struct timeval)); + sizeof(io_u->issue_time)); } return ret; @@ -368,17 +392,17 @@ if (td->io_ops->init) { ret = td->io_ops->init(td); - if (ret && td->o.iodepth > 1) { - log_err("fio: io engine init failed. Perhaps try" - " reducing io depth?\n"); - } + if (ret) + log_err("fio: io engine %s init failed.%s\n", + td->io_ops->name, + td->o.iodepth > 1 ? + " Perhaps try reducing io depth?" 
: ""); + else + td->io_ops_init = 1; if (!td->error) td->error = ret; } - if (!ret && td_ioengine_flagged(td, FIO_NOIO)) - td->flags |= TD_F_NOIO; - return ret; } @@ -449,7 +473,7 @@ goto err; if (td->o.fadvise_hint != F_ADV_NONE && - (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_FILE)) { + (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) { int flags; if (td->o.fadvise_hint == F_ADV_TYPE) { @@ -472,39 +496,32 @@ goto err; } } -#ifdef FIO_HAVE_STREAMID - if (td->o.fadvise_stream && - (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_FILE)) { - off_t stream = td->o.fadvise_stream; - - if (posix_fadvise(f->fd, stream, f->io_size, POSIX_FADV_STREAMID) < 0) { - td_verror(td, errno, "fadvise streamid"); - goto err; - } - } -#endif - -#ifdef FIO_OS_DIRECTIO - /* - * Some OS's have a distinct call to mark the file non-buffered, - * instead of using O_DIRECT (Solaris) - */ - if (td->o.odirect) { - int ret = fio_set_odirect(f->fd); +#ifdef FIO_HAVE_WRITE_HINT + if (fio_option_is_set(&td->o, write_hint) && + (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) { + uint64_t hint = td->o.write_hint; + int cmd; - if (ret) { - td_verror(td, ret, "fio_set_odirect"); - if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */ - log_err("fio: doing directIO to RAW devices or ZFS not supported\n"); - } else { - log_err("fio: the file system does not seem to support direct IO\n"); - } + /* + * For direct IO, we just need/want to set the hint on + * the file descriptor. For buffered IO, we need to set + * it on the inode. 
+ */ + if (td->o.odirect) + cmd = F_SET_FILE_RW_HINT; + else + cmd = F_SET_RW_HINT; + if (fcntl(f->fd, cmd, &hint) < 0) { + td_verror(td, errno, "fcntl write hint"); goto err; } } #endif + if (td->o.odirect && !OS_O_DIRECT && fio_set_directio(td, f)) + goto err; + done: log_file(td, f, FIO_LOG_OPEN_FILE); return 0; @@ -556,77 +573,18 @@ return td->io_ops->get_file_size(td, f); } -static int do_sync_file_range(const struct thread_data *td, - struct fio_file *f) -{ - off64_t offset, nbytes; - - offset = f->first_write; - nbytes = f->last_write - f->first_write; - - if (!nbytes) - return 0; - - return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range); -} - -int do_io_u_sync(const struct thread_data *td, struct io_u *io_u) -{ - int ret; - - if (io_u->ddir == DDIR_SYNC) { - ret = fsync(io_u->file->fd); - } else if (io_u->ddir == DDIR_DATASYNC) { -#ifdef CONFIG_FDATASYNC - ret = fdatasync(io_u->file->fd); -#else - ret = io_u->xfer_buflen; - io_u->error = EINVAL; -#endif - } else if (io_u->ddir == DDIR_SYNC_FILE_RANGE) - ret = do_sync_file_range(td, io_u->file); - else { - ret = io_u->xfer_buflen; - io_u->error = EINVAL; - } - - if (ret < 0) - io_u->error = errno; - - return ret; -} - -int do_io_u_trim(const struct thread_data *td, struct io_u *io_u) -{ -#ifndef FIO_HAVE_TRIM - io_u->error = EINVAL; - return 0; -#else - struct fio_file *f = io_u->file; - int ret; - - ret = os_trim(f->fd, io_u->offset, io_u->xfer_buflen); - if (!ret) - return io_u->xfer_buflen; - - io_u->error = ret; - return 0; -#endif -} - int fio_show_ioengine_help(const char *engine) { struct flist_head *entry; - struct thread_data td; + struct ioengine_ops *io_ops; char *sep; int ret = 1; if (!engine || !*engine) { log_info("Available IO engines:\n"); flist_for_each(entry, &engine_list) { - td.io_ops = flist_entry(entry, struct ioengine_ops, - list); - log_info("\t%s\n", td.io_ops->name); + io_ops = flist_entry(entry, struct ioengine_ops, list); + log_info("\t%s\n", io_ops->name); } return 
0; } @@ -636,20 +594,16 @@ sep++; } - memset(&td, 0, sizeof(td)); - - td.io_ops = load_ioengine(&td, engine); - if (!td.io_ops) { + io_ops = __load_ioengine(engine); + if (!io_ops) { log_info("IO engine %s not found\n", engine); return 1; } - if (td.io_ops->options) - ret = show_cmd_help(td.io_ops->options, sep); + if (io_ops->options) + ret = show_cmd_help(io_ops->options, sep); else - log_info("IO engine %s has no options\n", td.io_ops->name); - - free_ioengine(&td); + log_info("IO engine %s has no options\n", io_ops->name); return ret; } diff -Nru fio-2.16/ioengines.h fio-3.1/ioengines.h --- fio-2.16/ioengines.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/ioengines.h 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,90 @@ +#ifndef FIO_IOENGINE_H +#define FIO_IOENGINE_H + +#include "compiler/compiler.h" +#include "os/os.h" +#include "file.h" +#include "io_u.h" + +#define FIO_IOOPS_VERSION 23 + +/* + * io_ops->queue() return values + */ +enum { + FIO_Q_COMPLETED = 0, /* completed sync */ + FIO_Q_QUEUED = 1, /* queued, will complete async */ + FIO_Q_BUSY = 2, /* no more room, call ->commit() */ +}; + +struct ioengine_ops { + struct flist_head list; + const char *name; + int version; + int flags; + int (*setup)(struct thread_data *); + int (*init)(struct thread_data *); + int (*prep)(struct thread_data *, struct io_u *); + int (*queue)(struct thread_data *, struct io_u *); + int (*commit)(struct thread_data *); + int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *); + struct io_u *(*event)(struct thread_data *, int); + char *(*errdetails)(struct io_u *); + int (*cancel)(struct thread_data *, struct io_u *); + void (*cleanup)(struct thread_data *); + int (*open_file)(struct thread_data *, struct fio_file *); + int (*close_file)(struct thread_data *, struct fio_file *); + int (*invalidate)(struct thread_data *, struct fio_file *); + int (*unlink_file)(struct thread_data *, struct fio_file *); + int (*get_file_size)(struct 
thread_data *, struct fio_file *); + void (*terminate)(struct thread_data *); + int (*iomem_alloc)(struct thread_data *, size_t); + void (*iomem_free)(struct thread_data *); + int (*io_u_init)(struct thread_data *, struct io_u *); + void (*io_u_free)(struct thread_data *, struct io_u *); + int option_struct_size; + struct fio_option *options; +}; + +enum fio_ioengine_flags { + FIO_SYNCIO = 1 << 0, /* io engine has synchronous ->queue */ + FIO_RAWIO = 1 << 1, /* some sort of direct/raw io */ + FIO_DISKLESSIO = 1 << 2, /* no disk involved */ + FIO_NOEXTEND = 1 << 3, /* engine can't extend file */ + FIO_NODISKUTIL = 1 << 4, /* diskutil can't handle filename */ + FIO_UNIDIR = 1 << 5, /* engine is uni-directional */ + FIO_NOIO = 1 << 6, /* thread does only pseudo IO */ + FIO_PIPEIO = 1 << 7, /* input/output no seekable */ + FIO_BARRIER = 1 << 8, /* engine supports barriers */ + FIO_MEMALIGN = 1 << 9, /* engine wants aligned memory */ + FIO_BIT_BASED = 1 << 10, /* engine uses a bit base (e.g. 
uses Kbit as opposed to KB) */ + FIO_FAKEIO = 1 << 11, /* engine pretends to do IO */ +}; + +/* + * External engine defined symbol to fill in the engine ops structure + */ +typedef void (*get_ioengine_t)(struct ioengine_ops **); + +/* + * io engine entry points + */ +extern int __must_check td_io_init(struct thread_data *); +extern int __must_check td_io_prep(struct thread_data *, struct io_u *); +extern int __must_check td_io_queue(struct thread_data *, struct io_u *); +extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *); +extern int __must_check td_io_commit(struct thread_data *); +extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *); +extern int td_io_close_file(struct thread_data *, struct fio_file *); +extern int td_io_unlink_file(struct thread_data *, struct fio_file *); +extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *); + +extern struct ioengine_ops *load_ioengine(struct thread_data *); +extern void register_ioengine(struct ioengine_ops *); +extern void unregister_ioengine(struct ioengine_ops *); +extern void free_ioengine(struct thread_data *); +extern void close_ioengine(struct thread_data *); + +extern int fio_show_ioengine_help(const char *engine); + +#endif diff -Nru fio-2.16/iolog.c fio-3.1/iolog.c --- fio-2.16/iolog.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/iolog.c 2017-09-28 10:23:20.000000000 +0000 @@ -19,6 +19,7 @@ #include "trim.h" #include "filelock.h" #include "smalloc.h" +#include "blktrace.h" static int iolog_flush(struct io_log *log); @@ -64,7 +65,7 @@ { uint64_t usec = utime_since_now(&td->last_issue); uint64_t this_delay; - struct timeval tv; + struct timespec ts; if (delay < td->time_offset) { td->time_offset = 0; @@ -77,7 +78,7 @@ delay -= usec; - fio_gettime(&tv, NULL); + fio_gettime(&ts, NULL); while (delay && !td->terminate) { this_delay = delay; if (this_delay > 500000) @@ -87,7 +88,7 @@ delay -= 
this_delay; } - usec = utime_since_now(&tv); + usec = utime_since_now(&ts); if (usec > delay) td->time_offset = usec - delay; else @@ -226,21 +227,16 @@ } /* - * We don't need to sort the entries, if: + * We don't need to sort the entries if we only performed sequential + * writes. In this case, just reading back data in the order we wrote + * it out is the faster but still safe. * - * Sequential writes, or - * Random writes that lay out the file as it goes along - * - * For both these cases, just reading back data in the order we - * wrote it out is the fastest. - * - * One exception is if we don't have a random map AND we are doing - * verifies, in that case we need to check for duplicate blocks and - * drop the old one, which we rely on the rb insert/lookup for - * handling. + * One exception is if we don't have a random map in which case we need + * to check for duplicate blocks and drop the old one, which we rely on + * the rb insert/lookup for handling. */ - if (((!td->o.verifysort) || !td_random(td) || !td->o.overwrite) && - (file_randommap(td, ipo->file) || td->o.verify == VERIFY_NONE)) { + if (((!td->o.verifysort) || !td_random(td)) && + file_randommap(td, ipo->file)) { INIT_FLIST_HEAD(&ipo->list); flist_add_tail(&ipo->list, &td->io_hist_list); ipo->flags |= IP_F_ONLIST; @@ -277,13 +273,14 @@ overlap = 1; if (overlap) { - dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu", + dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu\n", __ipo->offset, __ipo->len, ipo->offset, ipo->len); td->io_hist_len--; rb_erase(parent, &td->io_hist_tree); remove_trim_entry(td, __ipo); - free(__ipo); + if (!(__ipo->flags & IP_F_IN_FLIGHT)) + free(__ipo); goto restart; } } @@ -422,7 +419,7 @@ continue; } } else { - log_err("bad iolog2: %s", p); + log_err("bad iolog2: %s\n", p); continue; } @@ -642,6 +639,7 @@ l->log_gz = 0; else if (l->log_gz || l->log_gz_store) { mutex_init_pshared(&l->chunk_lock); + mutex_init_pshared(&l->deferred_free_lock); p->td->flags |= TD_F_COMPRESS_LOG; } 
@@ -696,7 +694,7 @@ sfree(log); } -inline unsigned long hist_sum(int j, int stride, unsigned int *io_u_plat, +unsigned long hist_sum(int j, int stride, unsigned int *io_u_plat, unsigned int *io_u_plat_last) { unsigned long sum; @@ -1143,6 +1141,42 @@ #ifdef CONFIG_ZLIB +static bool warned_on_drop; + +static void iolog_put_deferred(struct io_log *log, void *ptr) +{ + if (!ptr) + return; + + pthread_mutex_lock(&log->deferred_free_lock); + if (log->deferred < IOLOG_MAX_DEFER) { + log->deferred_items[log->deferred] = ptr; + log->deferred++; + } else if (!warned_on_drop) { + log_err("fio: had to drop log entry free\n"); + warned_on_drop = true; + } + pthread_mutex_unlock(&log->deferred_free_lock); +} + +static void iolog_free_deferred(struct io_log *log) +{ + int i; + + if (!log->deferred) + return; + + pthread_mutex_lock(&log->deferred_free_lock); + + for (i = 0; i < log->deferred; i++) { + free(log->deferred_items[i]); + log->deferred_items[i] = NULL; + } + + log->deferred = 0; + pthread_mutex_unlock(&log->deferred_free_lock); +} + static int gz_work(struct iolog_flush_data *data) { struct iolog_compress *c = NULL; @@ -1235,7 +1269,7 @@ if (ret != Z_OK) log_err("fio: deflateEnd %d\n", ret); - free(data->samples); + iolog_put_deferred(data->log, data->samples); if (!flist_empty(&list)) { pthread_mutex_lock(&data->log->chunk_lock); @@ -1246,7 +1280,7 @@ ret = 0; done: if (data->free) - free(data); + sfree(data); return ret; err: while (!flist_empty(&list)) { @@ -1347,7 +1381,7 @@ { struct iolog_flush_data *data; - data = malloc(sizeof(*data)); + data = smalloc(sizeof(*data)); if (!data) return 1; @@ -1361,6 +1395,9 @@ cur_log->log = NULL; workqueue_enqueue(&log->td->log_compress_wq, &data->work); + + iolog_free_deferred(log); + return 0; } #else diff -Nru fio-2.16/iolog.h fio-3.1/iolog.h --- fio-2.16/iolog.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/iolog.h 2017-09-28 10:23:20.000000000 +0000 @@ -4,7 +4,7 @@ #include "lib/rbtree.h" #include "lib/ieee754.h" 
#include "flist.h" -#include "ioengine.h" +#include "ioengines.h" /* * Use for maintaining statistics @@ -117,7 +117,7 @@ */ struct io_stat avg_window[DDIR_RWDIR_CNT]; unsigned long avg_msec; - unsigned long avg_last; + unsigned long avg_last[DDIR_RWDIR_CNT]; /* * Windowed latency histograms, for keeping track of when we need to @@ -131,6 +131,11 @@ pthread_mutex_t chunk_lock; unsigned int chunk_seq; struct flist_head chunk_list; + + pthread_mutex_t deferred_free_lock; +#define IOLOG_MAX_DEFER 8 + void *deferred_items[IOLOG_MAX_DEFER]; + unsigned int deferred; }; /* @@ -259,7 +264,7 @@ static inline bool per_unit_log(struct io_log *log) { - return log && !log->avg_msec; + return log && (!log->avg_msec || log->log_gz || log->log_gz_store); } static inline bool inline_log(struct io_log *log) @@ -271,7 +276,7 @@ static inline void ipo_bytes_align(unsigned int replay_align, struct io_piece *ipo) { - if (replay_align) + if (!replay_align) return; ipo->offset &= ~(replay_align - (uint64_t)1); diff -Nru fio-2.16/io_u.c fio-3.1/io_u.c --- fio-2.16/io_u.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/io_u.c 2017-09-28 10:23:20.000000000 +0000 @@ -20,7 +20,7 @@ int error; /* output */ uint64_t bytes_done[DDIR_RWDIR_CNT]; /* output */ - struct timeval time; /* output */ + struct timespec time; /* output */ }; /* @@ -37,7 +37,7 @@ */ static void mark_random_map(struct thread_data *td, struct io_u *io_u) { - unsigned int min_bs = td->o.rw_min_bs; + unsigned int min_bs = td->o.min_bs[io_u->ddir]; struct fio_file *f = io_u->file; unsigned int nr_blocks; uint64_t block; @@ -62,6 +62,7 @@ /* * Hmm, should we make sure that ->io_size <= ->real_file_size? + * -> not for now since there is code assuming it could go either. 
*/ max_size = f->io_size; if (max_size > f->real_file_size) @@ -532,6 +533,7 @@ unsigned int buflen = 0; unsigned int minbs, maxbs; uint64_t frand_max, r; + bool power_2; assert(ddir_rw(ddir)); @@ -550,9 +552,9 @@ if (!io_u_fits(td, io_u, minbs)) return 0; - frand_max = rand_max(&td->bsrange_state); + frand_max = rand_max(&td->bsrange_state[ddir]); do { - r = __rand(&td->bsrange_state); + r = __rand(&td->bsrange_state[ddir]); if (!td->o.bssplit_nr[ddir]) { buflen = 1 + (unsigned int) ((double) maxbs * @@ -576,13 +578,11 @@ } } - if (td->o.verify != VERIFY_NONE) - buflen = (buflen + td->o.verify_interval - 1) & - ~(td->o.verify_interval - 1); - - if (!td->o.bs_unaligned && is_power_of_2(minbs)) + power_2 = is_power_of_2(minbs); + if (!td->o.bs_unaligned && power_2) buflen &= ~(minbs - 1); - + else if (!td->o.bs_unaligned && !power_2) + buflen -= buflen % minbs; } while (!io_u_fits(td, io_u, buflen)); return buflen; @@ -646,7 +646,7 @@ } while (td->io_u_in_flight) { - int fio_unused ret; + int ret; ret = io_u_queued_complete(td, 1); if (ret > 0) @@ -717,28 +717,22 @@ enum fio_ddir ddir; /* - * see if it's time to fsync + * See if it's time to fsync/fdatasync/sync_file_range first, + * and if not then move on to check regular I/Os. 
*/ - if (td->o.fsync_blocks && - !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) && - td->io_issues[DDIR_WRITE] && should_fsync(td)) - return DDIR_SYNC; - - /* - * see if it's time to fdatasync - */ - if (td->o.fdatasync_blocks && - !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) && - td->io_issues[DDIR_WRITE] && should_fsync(td)) - return DDIR_DATASYNC; - - /* - * see if it's time to sync_file_range - */ - if (td->sync_file_range_nr && - !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) && - td->io_issues[DDIR_WRITE] && should_fsync(td)) - return DDIR_SYNC_FILE_RANGE; + if (should_fsync(td)) { + if (td->o.fsync_blocks && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks)) + return DDIR_SYNC; + + if (td->o.fdatasync_blocks && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks)) + return DDIR_DATASYNC; + + if (td->sync_file_range_nr && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr)) + return DDIR_SYNC_FILE_RANGE; + } if (td_rw(td)) { /* @@ -762,8 +756,10 @@ ddir = DDIR_READ; else if (td_write(td)) ddir = DDIR_WRITE; - else + else if (td_trim(td)) ddir = DDIR_TRIM; + else + ddir = DDIR_INVAL; td->rwmix_ddir = rate_ddir(td, ddir); return td->rwmix_ddir; @@ -903,8 +899,9 @@ } if (io_u->offset + io_u->buflen > io_u->file->real_file_size) { - dprint(FD_IO, "io_u %p, offset too large\n", io_u); - dprint(FD_IO, " off=%llu/%lu > %llu\n", + dprint(FD_IO, "io_u %p, offset + buflen exceeds file size\n", + io_u); + dprint(FD_IO, " offset=%llu/buflen=%lu > %llu\n", (unsigned long long) io_u->offset, io_u->buflen, (unsigned long long) io_u->file->real_file_size); return 1; @@ -992,11 +989,52 @@ td->ts.io_u_map[idx] += nr; } -static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec) +static void io_u_mark_lat_nsec(struct thread_data *td, unsigned long long nsec) +{ + int idx = 0; + + assert(nsec < 1000); + + switch (nsec) { + case 750 ... 
999: + idx = 9; + break; + case 500 ... 749: + idx = 8; + break; + case 250 ... 499: + idx = 7; + break; + case 100 ... 249: + idx = 6; + break; + case 50 ... 99: + idx = 5; + break; + case 20 ... 49: + idx = 4; + break; + case 10 ... 19: + idx = 3; + break; + case 4 ... 9: + idx = 2; + break; + case 2 ... 3: + idx = 1; + case 0 ... 1: + break; + } + + assert(idx < FIO_IO_U_LAT_N_NR); + td->ts.io_u_lat_n[idx]++; +} + +static void io_u_mark_lat_usec(struct thread_data *td, unsigned long long usec) { int idx = 0; - assert(usec < 1000); + assert(usec < 1000 && usec >= 1); switch (usec) { case 750 ... 999: @@ -1033,10 +1071,12 @@ td->ts.io_u_lat_u[idx]++; } -static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec) +static void io_u_mark_lat_msec(struct thread_data *td, unsigned long long msec) { int idx = 0; + assert(msec >= 1); + switch (msec) { default: idx = 11; @@ -1078,12 +1118,14 @@ td->ts.io_u_lat_m[idx]++; } -static void io_u_mark_latency(struct thread_data *td, unsigned long usec) +static void io_u_mark_latency(struct thread_data *td, unsigned long long nsec) { - if (usec < 1000) - io_u_mark_lat_usec(td, usec); + if (nsec < 1000) + io_u_mark_lat_nsec(td, nsec); + else if (nsec < 1000000) + io_u_mark_lat_usec(td, nsec / 1000); else - io_u_mark_lat_msec(td, usec / 1000); + io_u_mark_lat_msec(td, nsec / 1000000); } static unsigned int __get_next_fileno_rand(struct thread_data *td) @@ -1560,7 +1602,7 @@ unsigned int i, nr_blocks = io_u->buflen / 512; uint64_t boffset; unsigned int offset; - void *p, *end; + char *p, *end; if (!nr_blocks) return; @@ -1575,7 +1617,7 @@ * the buffer, given by the product of the usec time * and the actual offset. 
*/ - offset = (io_u->start_time.tv_usec ^ boffset) & 511; + offset = ((io_u->start_time.tv_nsec/1000) ^ boffset) & 511; offset &= ~(sizeof(uint64_t) - 1); if (offset >= 512 - sizeof(uint64_t)) offset -= sizeof(uint64_t); @@ -1677,8 +1719,10 @@ if (!td_io_prep(td, io_u)) { if (!td->o.disable_lat) fio_gettime(&io_u->start_time, NULL); + if (do_scramble) small_content_scramble(io_u); + return io_u; } err_put: @@ -1730,43 +1774,46 @@ const enum fio_ddir idx, unsigned int bytes) { const int no_reduce = !gtod_reduce(td); - unsigned long lusec = 0; + unsigned long long llnsec = 0; if (td->parent) td = td->parent; + if (!td->o.stats) + return; + if (no_reduce) - lusec = utime_since(&io_u->issue_time, &icd->time); + llnsec = ntime_since(&io_u->issue_time, &icd->time); if (!td->o.disable_lat) { - unsigned long tusec; + unsigned long long tnsec; - tusec = utime_since(&io_u->start_time, &icd->time); - add_lat_sample(td, idx, tusec, bytes, io_u->offset); + tnsec = ntime_since(&io_u->start_time, &icd->time); + add_lat_sample(td, idx, tnsec, bytes, io_u->offset); if (td->flags & TD_F_PROFILE_OPS) { struct prof_io_ops *ops = &td->prof_io_ops; if (ops->io_u_lat) - icd->error = ops->io_u_lat(td, tusec); + icd->error = ops->io_u_lat(td, tnsec/1000); } - if (td->o.max_latency && tusec > td->o.max_latency) - lat_fatal(td, icd, tusec, td->o.max_latency); - if (td->o.latency_target && tusec > td->o.latency_target) { + if (td->o.max_latency && tnsec/1000 > td->o.max_latency) + lat_fatal(td, icd, tnsec/1000, td->o.max_latency); + if (td->o.latency_target && tnsec/1000 > td->o.latency_target) { if (lat_target_failed(td)) - lat_fatal(td, icd, tusec, td->o.latency_target); + lat_fatal(td, icd, tnsec/1000, td->o.latency_target); } } if (ddir_rw(idx)) { if (!td->o.disable_clat) { - add_clat_sample(td, idx, lusec, bytes, io_u->offset); - io_u_mark_latency(td, lusec); + add_clat_sample(td, idx, llnsec, bytes, io_u->offset); + io_u_mark_latency(td, llnsec); } if (!td->o.disable_bw && 
per_unit_log(td->bw_log)) - add_bw_sample(td, io_u, bytes, lusec); + add_bw_sample(td, io_u, bytes, llnsec); if (no_reduce && per_unit_log(td->iops_log)) add_iops_sample(td, io_u, bytes); @@ -1906,7 +1953,7 @@ icd->nr = nr; icd->error = 0; - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) icd->bytes_done[ddir] = 0; } @@ -1945,7 +1992,7 @@ return -1; } - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) td->bytes_done[ddir] += icd.bytes_done[ddir]; return 0; @@ -1961,7 +2008,7 @@ int ret, ddir; struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, }; - dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_evts); + dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts); if (!min_evts) tvp = &ts; @@ -1984,7 +2031,7 @@ return -1; } - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) td->bytes_done[ddir] += icd.bytes_done[ddir]; return ret; @@ -1995,10 +2042,10 @@ */ void io_u_queued(struct thread_data *td, struct io_u *io_u) { - if (!td->o.disable_slat) { + if (!td->o.disable_slat && ramp_time_over(td) && td->o.stats) { unsigned long slat_time; - slat_time = utime_since(&io_u->start_time, &io_u->issue_time); + slat_time = ntime_since(&io_u->start_time, &io_u->issue_time); if (td->parent) td = td->parent; @@ -2043,6 +2090,9 @@ { struct thread_options *o = &td->o; + if (o->mem_type == MEM_CUDA_MALLOC) + return; + if (o->compress_percentage || o->dedupe_percentage) { unsigned int perc = td->o.compress_percentage; struct frand_state *rs; @@ -2088,3 +2138,61 @@ io_u->buf_filled_len = 0; fill_io_buffer(td, io_u->buf, min_write, max_bs); } + +static int do_sync_file_range(const struct thread_data *td, + struct fio_file *f) +{ + off64_t offset, nbytes; + + offset = f->first_write; + nbytes = f->last_write - f->first_write; + + if (!nbytes) + return 0; + + return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range); 
+} + +int do_io_u_sync(const struct thread_data *td, struct io_u *io_u) +{ + int ret; + + if (io_u->ddir == DDIR_SYNC) { + ret = fsync(io_u->file->fd); + } else if (io_u->ddir == DDIR_DATASYNC) { +#ifdef CONFIG_FDATASYNC + ret = fdatasync(io_u->file->fd); +#else + ret = io_u->xfer_buflen; + io_u->error = EINVAL; +#endif + } else if (io_u->ddir == DDIR_SYNC_FILE_RANGE) + ret = do_sync_file_range(td, io_u->file); + else { + ret = io_u->xfer_buflen; + io_u->error = EINVAL; + } + + if (ret < 0) + io_u->error = errno; + + return ret; +} + +int do_io_u_trim(const struct thread_data *td, struct io_u *io_u) +{ +#ifndef FIO_HAVE_TRIM + io_u->error = EINVAL; + return 0; +#else + struct fio_file *f = io_u->file; + int ret; + + ret = os_trim(f, io_u->offset, io_u->xfer_buflen); + if (!ret) + return io_u->xfer_buflen; + + io_u->error = ret; + return 0; +#endif +} diff -Nru fio-2.16/io_u.h fio-3.1/io_u.h --- fio-2.16/io_u.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/io_u.h 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,179 @@ +#ifndef FIO_IO_U +#define FIO_IO_U + +#include "compiler/compiler.h" +#include "os/os.h" +#include "log.h" +#include "io_ddir.h" +#include "debug.h" +#include "file.h" +#include "workqueue.h" + +#ifdef CONFIG_LIBAIO +#include +#endif +#ifdef CONFIG_GUASI +#include +#endif + +enum { + IO_U_F_FREE = 1 << 0, + IO_U_F_FLIGHT = 1 << 1, + IO_U_F_NO_FILE_PUT = 1 << 2, + IO_U_F_IN_CUR_DEPTH = 1 << 3, + IO_U_F_BUSY_OK = 1 << 4, + IO_U_F_TRIMMED = 1 << 5, + IO_U_F_BARRIER = 1 << 6, + IO_U_F_VER_LIST = 1 << 7, +}; + +/* + * The io unit + */ +struct io_u { + struct timespec start_time; + struct timespec issue_time; + + struct fio_file *file; + unsigned int flags; + enum fio_ddir ddir; + + /* + * For replay workloads, we may want to account as a different + * IO type than what is being submitted. 
+ */ + enum fio_ddir acct_ddir; + + /* + * Write generation + */ + unsigned short numberio; + + /* + * Allocated/set buffer and length + */ + unsigned long buflen; + unsigned long long offset; + void *buf; + + /* + * Initial seed for generating the buffer contents + */ + uint64_t rand_seed; + + /* + * IO engine state, may be different from above when we get + * partial transfers / residual data counts + */ + void *xfer_buf; + unsigned long xfer_buflen; + + /* + * Parameter related to pre-filled buffers and + * their size to handle variable block sizes. + */ + unsigned long buf_filled_len; + + struct io_piece *ipo; + + unsigned int resid; + unsigned int error; + + /* + * io engine private data + */ + union { + unsigned int index; + unsigned int seen; + void *engine_data; + }; + + union { + struct flist_head verify_list; + struct workqueue_work work; + }; + + /* + * Callback for io completion + */ + int (*end_io)(struct thread_data *, struct io_u **); + + union { +#ifdef CONFIG_LIBAIO + struct iocb iocb; +#endif +#ifdef CONFIG_POSIXAIO + os_aiocb_t aiocb; +#endif +#ifdef FIO_HAVE_SGIO + struct sg_io_hdr hdr; +#endif +#ifdef CONFIG_GUASI + guasi_req_t greq; +#endif +#ifdef CONFIG_SOLARISAIO + aio_result_t resultp; +#endif +#ifdef FIO_HAVE_BINJECT + struct b_user_cmd buc; +#endif +#ifdef CONFIG_RDMA + struct ibv_mr *mr; +#endif + void *mmap_data; + }; +}; + +/* + * io unit handling + */ +extern struct io_u *__get_io_u(struct thread_data *); +extern struct io_u *get_io_u(struct thread_data *); +extern void put_io_u(struct thread_data *, struct io_u *); +extern void clear_io_u(struct thread_data *, struct io_u *); +extern void requeue_io_u(struct thread_data *, struct io_u **); +extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *); +extern int __must_check io_u_queued_complete(struct thread_data *, int); +extern void io_u_queued(struct thread_data *, struct io_u *); +extern int io_u_quiesce(struct thread_data *); +extern void 
io_u_log_error(struct thread_data *, struct io_u *); +extern void io_u_mark_depth(struct thread_data *, unsigned int); +extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned int); +extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int); +void io_u_mark_complete(struct thread_data *, unsigned int); +void io_u_mark_submit(struct thread_data *, unsigned int); +bool queue_full(const struct thread_data *); + +int do_io_u_sync(const struct thread_data *, struct io_u *); +int do_io_u_trim(const struct thread_data *, struct io_u *); + +#ifdef FIO_INC_DEBUG +static inline void dprint_io_u(struct io_u *io_u, const char *p) +{ + struct fio_file *f = io_u->file; + + dprint(FD_IO, "%s: io_u %p: off=%llu/len=%lu/ddir=%d", p, io_u, + (unsigned long long) io_u->offset, + io_u->buflen, io_u->ddir); + if (f) + dprint(FD_IO, "/%s", f->file_name); + dprint(FD_IO, "\n"); +} +#else +#define dprint_io_u(io_u, p) +#endif + +static inline enum fio_ddir acct_ddir(struct io_u *io_u) +{ + if (io_u->acct_ddir != -1) + return io_u->acct_ddir; + + return io_u->ddir; +} + +#define io_u_clear(td, io_u, val) \ + td_flags_clear((td), &(io_u->flags), (val)) +#define io_u_set(td, io_u, val) \ + td_flags_set((td), &(io_u)->flags, (val)) + +#endif diff -Nru fio-2.16/lib/axmap.c fio-3.1/lib/axmap.c --- fio-2.16/lib/axmap.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/axmap.c 2017-09-28 10:23:20.000000000 +0000 @@ -184,6 +184,9 @@ void axmap_clear(struct axmap *axmap, uint64_t bit_nr) { axmap_handler(axmap, bit_nr, axmap_clear_fn, NULL); + + if (bit_nr < axmap->first_free) + axmap->first_free = bit_nr; } struct axmap_set_data { @@ -191,7 +194,7 @@ unsigned int set_bits; }; -static unsigned long bit_masks[] = { +static const unsigned long bit_masks[] = { 0x0000000000000000, 0x0000000000000001, 0x0000000000000003, 0x0000000000000007, 0x000000000000000f, 0x000000000000001f, 0x000000000000003f, 0x000000000000007f, 0x00000000000000ff, 
0x00000000000001ff, 0x00000000000003ff, 0x00000000000007ff, @@ -372,10 +375,9 @@ static uint64_t axmap_first_free(struct axmap *axmap) { - if (firstfree_valid(axmap)) - return axmap->first_free; + if (!firstfree_valid(axmap)) + axmap->first_free = axmap_find_first_free(axmap, axmap->nr_levels - 1, 0); - axmap->first_free = axmap_find_first_free(axmap, axmap->nr_levels - 1, 0); return axmap->first_free; } diff -Nru fio-2.16/lib/bloom.c fio-3.1/lib/bloom.c --- fio-2.16/lib/bloom.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/bloom.c 2017-09-28 10:23:20.000000000 +0000 @@ -65,6 +65,7 @@ struct bloom *b; size_t no_uints; + crc32c_arm64_probe(); crc32c_intel_probe(); b = malloc(sizeof(*b)); @@ -103,8 +104,10 @@ if (b->map[index] & (1U << bit)) was_set++; - if (set) + else if (set) b->map[index] |= 1U << bit; + else + break; } return was_set == N_HASHES; diff -Nru fio-2.16/lib/ffz.h fio-3.1/lib/ffz.h --- fio-2.16/lib/ffz.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/ffz.h 2017-09-28 10:23:20.000000000 +0000 @@ -27,10 +27,8 @@ word >>= 2; r += 2; } - if (!(word & 1)) { - word >>= 1; + if (!(word & 1)) r += 1; - } return r; } diff -Nru fio-2.16/lib/memalign.c fio-3.1/lib/memalign.c --- fio-2.16/lib/memalign.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/memalign.c 2017-09-28 10:23:20.000000000 +0000 @@ -4,13 +4,13 @@ #include "memalign.h" +#define PTR_ALIGN(ptr, mask) \ + (char *)((uintptr_t)((ptr) + (mask)) & ~(mask)) + struct align_footer { unsigned int offset; }; -#define PTR_ALIGN(ptr, mask) \ - (char *) (((uintptr_t) ((ptr) + (mask)) & ~(mask))) - void *fio_memalign(size_t alignment, size_t size) { struct align_footer *f; @@ -18,7 +18,7 @@ assert(!(alignment & (alignment - 1))); - ptr = malloc(size + alignment + size + sizeof(*f) - 1); + ptr = malloc(size + alignment + sizeof(*f) - 1); if (ptr) { ret = PTR_ALIGN(ptr, alignment - 1); f = ret + size; diff -Nru fio-2.16/lib/mountcheck.c fio-3.1/lib/mountcheck.c --- fio-2.16/lib/mountcheck.c 
2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/mountcheck.c 2017-09-28 10:23:20.000000000 +0000 @@ -4,7 +4,7 @@ #ifdef CONFIG_GETMNTENT #include -#include "lib/mountcheck.h" +#include "mountcheck.h" #define MTAB "/etc/mtab" diff -Nru fio-2.16/lib/num2str.c fio-3.1/lib/num2str.c --- fio-2.16/lib/num2str.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/num2str.c 2017-09-28 10:23:20.000000000 +0000 @@ -2,40 +2,71 @@ #include #include -#include "../fio.h" +#include "../compiler/compiler.h" +#include "num2str.h" -#define ARRAY_LENGTH(arr) sizeof(arr) / sizeof((arr)[0]) +#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0]))) -/* - * Cheesy number->string conversion, complete with carry rounding error. +/** + * num2str() - Cheesy number->string conversion, complete with carry rounding error. + * @num: quantity (e.g., number of blocks, bytes or bits) + * @maxlen: max number of digits in the output string (not counting prefix and units, but counting .) + * @base: multiplier for num (e.g., if num represents Ki, use 1024) + * @pow2: select unit prefix - 0=power-of-10 decimal SI, nonzero=power-of-2 binary IEC + * @units: select units - N2S_* macros defined in num2str.h + * @returns a malloc'd buffer containing "number[][]" */ -char *num2str(uint64_t num, int maxlen, int base, int pow2, int unit_base) +char *num2str(uint64_t num, int maxlen, int base, int pow2, int units) { - const char *postfix[] = { "", "K", "M", "G", "P", "E" }; - const char *byte_postfix[] = { "", "B", "bit" }; + const char *sistr[] = { "", "k", "M", "G", "T", "P" }; + const char *iecstr[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi" }; + const char **unitprefix; + const char *unitstr[] = { "", "/s", "B", "bit", "B/s", "bit/s" }; const unsigned int thousand[] = { 1000, 1024 }; - unsigned int modulo, decimals; - int byte_post_index = 0, post_index, carry = 0; - char tmp[32]; + unsigned int modulo; + int unit_index = 0, post_index, carry = 0; + char tmp[32], fmt[32]; char *buf; + 
compiletime_assert(sizeof(sistr) == sizeof(iecstr), "unit prefix arrays must be identical sizes"); + buf = malloc(128); + if (!buf) + return NULL; + + if (pow2) + unitprefix = iecstr; + else + unitprefix = sistr; for (post_index = 0; base > 1; post_index++) base /= thousand[!!pow2]; - switch (unit_base) { - case 1: - byte_post_index = 2; + switch (units) { + case N2S_PERSEC: + unit_index = 1; + break; + case N2S_BYTE: + unit_index = 2; + break; + case N2S_BIT: + unit_index = 3; num *= 8; break; - case 8: - byte_post_index = 1; + case N2S_BYTEPERSEC: + unit_index = 4; + break; + case N2S_BITPERSEC: + unit_index = 5; + num *= 8; break; } + /* + * Divide by K/Ki until string length of num <= maxlen. + */ modulo = -1U; - while (post_index < sizeof(postfix)) { + while (post_index < sizeof(sistr)) { sprintf(tmp, "%llu", (unsigned long long) num); if (strlen(tmp) <= maxlen) break; @@ -46,33 +77,38 @@ post_index++; } + /* + * If no modulo, then we're done. + */ if (modulo == -1U) { done: - if (post_index >= ARRAY_LENGTH(postfix)) + if (post_index >= ARRAY_SIZE(sistr)) post_index = 0; sprintf(buf, "%llu%s%s", (unsigned long long) num, - postfix[post_index], byte_postfix[byte_post_index]); + unitprefix[post_index], unitstr[unit_index]); return buf; } + /* + * If no room for decimals, then we're done. + */ sprintf(tmp, "%llu", (unsigned long long) num); - decimals = maxlen - strlen(tmp); - if (decimals <= 1) { + if ((int)(maxlen - strlen(tmp)) <= 1) { if (carry) num++; goto done; } - do { - sprintf(tmp, "%u", modulo); - if (strlen(tmp) <= decimals - 1) - break; - - modulo = (modulo + 9) / 10; - } while (1); + /* + * Fill in everything and return the result. 
+ */ + assert(maxlen - strlen(tmp) - 1 > 0); + assert(modulo < thousand[!!pow2]); + sprintf(fmt, "%%.%df", (int)(maxlen - strlen(tmp) - 1)); + sprintf(tmp, fmt, (double)modulo / (double)thousand[!!pow2]); - sprintf(buf, "%llu.%u%s%s", (unsigned long long) num, modulo, - postfix[post_index], byte_postfix[byte_post_index]); + sprintf(buf, "%llu.%s%s%s", (unsigned long long) num, &tmp[2], + unitprefix[post_index], unitstr[unit_index]); return buf; } diff -Nru fio-2.16/lib/num2str.h fio-3.1/lib/num2str.h --- fio-2.16/lib/num2str.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/lib/num2str.h 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,15 @@ +#ifndef FIO_NUM2STR_H +#define FIO_NUM2STR_H + +#include + +#define N2S_NONE 0 +#define N2S_BITPERSEC 1 /* match unit_base for bit rates */ +#define N2S_PERSEC 2 +#define N2S_BIT 3 +#define N2S_BYTE 4 +#define N2S_BYTEPERSEC 8 /* match unit_base for byte rates */ + +extern char *num2str(uint64_t, int, int, int, int); + +#endif diff -Nru fio-2.16/lib/output_buffer.c fio-3.1/lib/output_buffer.c --- fio-2.16/lib/output_buffer.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/output_buffer.c 2017-09-28 10:23:20.000000000 +0000 @@ -3,7 +3,6 @@ #include #include "output_buffer.h" -#include "../log.h" #include "../minmax.h" #define BUF_INC 1024 @@ -18,6 +17,7 @@ void buf_output_free(struct buf_output *out) { free(out->buf); + buf_output_init(out); } size_t buf_output_add(struct buf_output *out, const char *buf, size_t len) @@ -40,16 +40,3 @@ out->buflen += len; return len; } - -size_t buf_output_flush(struct buf_output *out) -{ - size_t ret = 0; - - if (out->buflen) { - ret = log_info_buf(out->buf, out->buflen); - memset(out->buf, 0, out->max_buflen); - out->buflen = 0; - } - - return ret; -} diff -Nru fio-2.16/lib/output_buffer.h fio-3.1/lib/output_buffer.h --- fio-2.16/lib/output_buffer.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/output_buffer.h 2017-09-28 10:23:20.000000000 +0000 @@ -12,6 +12,5 @@ void 
buf_output_init(struct buf_output *out); void buf_output_free(struct buf_output *out); size_t buf_output_add(struct buf_output *out, const char *buf, size_t len); -size_t buf_output_flush(struct buf_output *out); #endif diff -Nru fio-2.16/lib/pattern.c fio-3.1/lib/pattern.c --- fio-2.16/lib/pattern.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/pattern.c 2017-09-28 10:23:20.000000000 +0000 @@ -1,7 +1,77 @@ -#include "fio.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "strntol.h" #include "pattern.h" +#include "../minmax.h" #include "../oslib/strcasestr.h" +#include "../oslib/strndup.h" + +/** + * parse_file() - parses binary file to fill buffer + * @beg - string input, extract filename from this + * @out - output buffer where parsed number should be put + * @out_len - length of the output buffer + * @filled - pointer where number of bytes successfully + * parsed will be put + * + * Returns the end pointer where parsing has been stopped. + * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. 
+ */ +static const char *parse_file(const char *beg, char *out, + unsigned int out_len, + unsigned int *filled) +{ + const char *end; + char *file; + int fd; + ssize_t count; + + if (!out_len) + goto err_out; + + assert(*beg == '\''); + beg++; + end = strchr(beg, '\''); + if (!end) + goto err_out; + + file = strndup(beg, end - beg); + if (file == NULL) + goto err_out; + + fd = open(file, O_RDONLY); + if (fd < 0) + goto err_free_out; + + count = read(fd, out, out_len); + if (count == -1) + goto err_free_close_out; + + *filled = count; + close(fd); + free(file); + + /* Catch up quote */ + return end + 1; + +err_free_close_out: + close(fd); +err_free_out: + free(file); +err_out: + return NULL; + +} /** * parse_string() - parses string in double quotes, like "abc" @@ -264,6 +334,9 @@ parsed_fmt = 0; switch (*beg) { + case '\'': + end = parse_file(beg, out, out_len, &filled); + break; case '"': end = parse_string(beg, out, out_len, &filled); break; diff -Nru fio-2.16/lib/pow2.h fio-3.1/lib/pow2.h --- fio-2.16/lib/pow2.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/pow2.h 2017-09-28 10:23:20.000000000 +0000 @@ -2,8 +2,9 @@ #define FIO_POW2_H #include +#include "types.h" -static inline int is_power_of_2(uint64_t val) +static inline bool is_power_of_2(uint64_t val) { return (val != 0 && ((val & (val - 1)) == 0)); } diff -Nru fio-2.16/lib/prio_tree.c fio-3.1/lib/prio_tree.c --- fio-2.16/lib/prio_tree.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/prio_tree.c 2017-09-28 10:23:20.000000000 +0000 @@ -13,9 +13,12 @@ #include #include -#include "../fio.h" + +#include "../compiler/compiler.h" #include "prio_tree.h" +#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0]))) + /* * A clever mix of heap and radix trees forms a radix priority search tree (PST) * which is useful for storing intervals, e.g, we can consider a vma as a closed diff -Nru fio-2.16/lib/rand.c fio-3.1/lib/rand.c --- fio-2.16/lib/rand.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/rand.c 
2017-09-28 10:23:20.000000000 +0000 @@ -36,7 +36,7 @@ #include #include #include "rand.h" -#include "lib/pattern.h" +#include "pattern.h" #include "../hash.h" int arch_random; diff -Nru fio-2.16/lib/seqlock.h fio-3.1/lib/seqlock.h --- fio-2.16/lib/seqlock.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/lib/seqlock.h 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,49 @@ +#ifndef FIO_SEQLOCK_H +#define FIO_SEQLOCK_H + +#include "types.h" +#include "../arch/arch.h" + +struct seqlock { + volatile int sequence; +}; + +static inline void seqlock_init(struct seqlock *s) +{ + s->sequence = 0; +} + +static inline unsigned int read_seqlock_begin(struct seqlock *s) +{ + unsigned int seq; + + do { + seq = s->sequence; + if (!(seq & 1)) + break; + nop; + } while (1); + + read_barrier(); + return seq; +} + +static inline bool read_seqlock_retry(struct seqlock *s, unsigned int seq) +{ + read_barrier(); + return s->sequence != seq; +} + +static inline void write_seqlock_begin(struct seqlock *s) +{ + s->sequence++; + write_barrier(); +} + +static inline void write_seqlock_end(struct seqlock *s) +{ + write_barrier(); + s->sequence++; +} + +#endif diff -Nru fio-2.16/lib/strntol.c fio-3.1/lib/strntol.c --- fio-2.16/lib/strntol.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/strntol.c 2017-09-28 10:23:20.000000000 +0000 @@ -2,7 +2,7 @@ #include #include -#include "lib/strntol.h" +#include "strntol.h" long strntol(const char *str, size_t sz, char **end, int base) { diff -Nru fio-2.16/lib/zipf.c fio-3.1/lib/zipf.c --- fio-2.16/lib/zipf.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/lib/zipf.c 2017-09-28 10:23:20.000000000 +0000 @@ -6,7 +6,6 @@ #include #include #include "ieee754.h" -#include "../log.h" #include "zipf.h" #include "../minmax.h" #include "../hash.h" diff -Nru fio-2.16/libfio.c fio-3.1/libfio.c --- fio-2.16/libfio.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/libfio.c 2017-09-28 10:23:20.000000000 +0000 @@ -36,12 +36,7 @@ #include "helper_thread.h" #include 
"filehash.h" -/* - * Just expose an empty list, if the OS does not support disk util stats - */ -#ifndef FIO_HAVE_DISK_UTIL FLIST_HEAD(disk_list); -#endif unsigned long arch_flags = 0; @@ -149,10 +144,10 @@ } set_epoch_time(td, td->o.log_unix_epoch); - memcpy(&td->start, &td->epoch, sizeof(struct timeval)); - memcpy(&td->iops_sample_time, &td->epoch, sizeof(struct timeval)); - memcpy(&td->bw_sample_time, &td->epoch, sizeof(struct timeval)); - memcpy(&td->ss.prev_time, &td->epoch, sizeof(struct timeval)); + memcpy(&td->start, &td->epoch, sizeof(td->epoch)); + memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch)); lat_target_reset(td); clear_rusage_stat(td); @@ -281,7 +276,7 @@ int nr_io_threads = 0; for_each_td(td, i) { - if (td->flags & TD_F_NOIO) + if (td->io_ops_init && td_ioengine_flagged(td, FIO_NOIO)) continue; nr_io_threads++; if (td->runstate < TD_EXITED) @@ -311,6 +306,13 @@ return flags; } +enum { + ENDIAN_INVALID_BE = 1, + ENDIAN_INVALID_LE, + ENDIAN_INVALID_CONFIG, + ENDIAN_BROKEN, +}; + static int endian_check(void) { union { @@ -327,16 +329,16 @@ #if defined(CONFIG_LITTLE_ENDIAN) if (be) - return 1; + return ENDIAN_INVALID_BE; #elif defined(CONFIG_BIG_ENDIAN) if (le) - return 1; + return ENDIAN_INVALID_LE; #else - return 1; + return ENDIAN_INVALID_CONFIG; #endif if (!le && !be) - return 1; + return ENDIAN_BROKEN; return 0; } @@ -344,23 +346,45 @@ int initialize_fio(char *envp[]) { long ps; + int err; /* * We need these to be properly 64-bit aligned, otherwise we * can run into problems on archs that fault on unaligned fp * access (ARM). 
*/ + compiletime_assert((offsetof(struct thread_data, ts) % sizeof(void *)) == 0, "ts"); compiletime_assert((offsetof(struct thread_stat, percentile_list) % 8) == 0, "stat percentile_list"); compiletime_assert((offsetof(struct thread_stat, total_run_time) % 8) == 0, "total_run_time"); compiletime_assert((offsetof(struct thread_stat, total_err_count) % 8) == 0, "total_err_count"); compiletime_assert((offsetof(struct thread_stat, latency_percentile) % 8) == 0, "stat latency_percentile"); + compiletime_assert((offsetof(struct thread_data, ts.clat_stat) % 8) == 0, "ts.clat_stat"); compiletime_assert((offsetof(struct thread_options_pack, zipf_theta) % 8) == 0, "zipf_theta"); compiletime_assert((offsetof(struct thread_options_pack, pareto_h) % 8) == 0, "pareto_h"); compiletime_assert((offsetof(struct thread_options_pack, percentile_list) % 8) == 0, "percentile_list"); compiletime_assert((offsetof(struct thread_options_pack, latency_percentile) % 8) == 0, "latency_percentile"); + compiletime_assert((offsetof(struct jobs_eta, m_rate) % 8) == 0, "m_rate"); - if (endian_check()) { + err = endian_check(); + if (err) { log_err("fio: endianness settings appear wrong.\n"); + switch (err) { + case ENDIAN_INVALID_BE: + log_err("fio: got big-endian when configured for little\n"); + break; + case ENDIAN_INVALID_LE: + log_err("fio: got little-endian when configured for big\n"); + break; + case ENDIAN_INVALID_CONFIG: + log_err("fio: not configured to any endianness\n"); + break; + case ENDIAN_BROKEN: + log_err("fio: failed to detect endianness\n"); + break; + default: + assert(0); + break; + } log_err("fio: please report this to fio@vger.kernel.org\n"); return 1; } diff -Nru fio-2.16/log.c fio-3.1/log.c --- fio-2.16/log.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/log.c 2017-09-28 10:23:20.000000000 +0000 @@ -6,8 +6,16 @@ #include "fio.h" +#define LOG_START_SZ 512 + size_t log_info_buf(const char *buf, size_t len) { + /* + * buf could be NULL (not just ""). 
+ */ + if (!buf) + return 0; + if (is_backend) { size_t ret = fio_server_text_output(FIO_LOG_INFO, buf, len); if (ret != -1) @@ -21,40 +29,66 @@ return fwrite(buf, len, 1, f_out); } -size_t log_valist(const char *str, va_list args) +static size_t valist_to_buf(char **buffer, const char *fmt, va_list src_args) { - char buffer[1024]; + size_t len, cur = LOG_START_SZ; + va_list args; + + do { + *buffer = calloc(1, cur); + + va_copy(args, src_args); + len = vsnprintf(*buffer, cur, fmt, args); + va_end(args); + + if (len < cur) + break; + + cur = len + 1; + free(*buffer); + } while (1); + + return len; +} + +size_t log_valist(const char *fmt, va_list args) +{ + char *buffer; size_t len; - len = vsnprintf(buffer, sizeof(buffer), str, args); + len = valist_to_buf(&buffer, fmt, args); + len = log_info_buf(buffer, len); + free(buffer); - return log_info_buf(buffer, min(len, sizeof(buffer) - 1)); + return len; } size_t log_info(const char *format, ...) { - char buffer[1024]; va_list args; - size_t len; + size_t ret; va_start(args, format); - len = vsnprintf(buffer, sizeof(buffer), format, args); + ret = log_valist(format, args); va_end(args); - return log_info_buf(buffer, min(len, sizeof(buffer) - 1)); + return ret; } size_t __log_buf(struct buf_output *buf, const char *format, ...) { - char buffer[1024]; + char *buffer; va_list args; size_t len; va_start(args, format); - len = vsnprintf(buffer, sizeof(buffer), format, args); + len = valist_to_buf(&buffer, format, args); va_end(args); - return buf_output_add(buf, buffer, min(len, sizeof(buffer) - 1)); + len = buf_output_add(buf, buffer, len); + free(buffer); + + return len; } int log_info_flush(void) @@ -67,33 +101,33 @@ size_t log_err(const char *format, ...) 
{ - char buffer[1024]; + size_t ret, len; + char *buffer; va_list args; - size_t len; va_start(args, format); - len = vsnprintf(buffer, sizeof(buffer), format, args); + len = valist_to_buf(&buffer, format, args); va_end(args); - len = min(len, sizeof(buffer) - 1); if (is_backend) { - size_t ret = fio_server_text_output(FIO_LOG_ERR, buffer, len); + ret = fio_server_text_output(FIO_LOG_ERR, buffer, len); if (ret != -1) - return ret; + goto done; } if (log_syslog) { syslog(LOG_INFO, "%s", buffer); - return len; + ret = len; } else { - if (f_err != stderr) { - int fio_unused ret; - + if (f_err != stderr) ret = fwrite(buffer, len, 1, stderr); - } - return fwrite(buffer, len, 1, f_err); + ret = fwrite(buffer, len, 1, f_err); } + +done: + free(buffer); + return ret; } const char *log_get_level(int level) diff -Nru fio-2.16/log.h fio-3.1/log.h --- fio-2.16/log.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/log.h 2017-09-28 10:23:20.000000000 +0000 @@ -16,13 +16,15 @@ extern size_t log_info_buf(const char *buf, size_t len); extern int log_info_flush(void); -#define log_buf(buf, format, args...) \ -do { \ - if ((buf) != NULL) \ - __log_buf(buf, format, ##args); \ - else \ - log_info(format, ##args); \ -} while (0) +#define log_buf(buf, format, args...) \ +({ \ + size_t __ret; \ + if ((buf) != NULL) \ + __ret = __log_buf(buf, format, ##args); \ + else \ + __ret = log_info(format, ##args); \ + __ret; \ +}) enum { FIO_LOG_DEBUG = 1, diff -Nru fio-2.16/Makefile fio-3.1/Makefile --- fio-2.16/Makefile 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/Makefile 2017-09-28 10:23:20.000000000 +0000 @@ -26,7 +26,7 @@ CFLAGS = -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. 
-I$(SRCDIR) LIBS += -lm $(EXTLIBS) PROGS = fio -SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/fio_latency2csv.py tools/hist/fiologparser_hist.py) +SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/fio_jsonplus_clat2csv) ifndef CONFIG_FIO_NO_OPT CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 @@ -36,12 +36,13 @@ PROGS += gfio endif -SOURCE := $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \ - $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c)) \ +SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \ + $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c))) \ gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \ eta.c verify.c memory.c io_u.c parse.c mutex.c options.c \ smalloc.c filehash.c profile.c debug.c engines/cpu.c \ engines/mmap.c engines/sync.c engines/null.c engines/net.c \ + engines/ftruncate.c \ server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \ gettime-thread.c helpers.c json.c idletime.c td_error.c \ profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \ @@ -106,6 +107,9 @@ ifndef CONFIG_STRLCAT SOURCE += oslib/strlcat.c endif +ifndef CONFIG_HAVE_STRNDUP + SOURCE += oslib/strndup.c +endif ifndef CONFIG_GETOPT_LONG_ONLY SOURCE += oslib/getopt_long.c endif @@ -139,7 +143,7 @@ LDFLAGS += -rdynamic endif ifeq ($(CONFIG_TARGET_OS), Android) - SOURCE += diskutil.c fifo.c blktrace.c trim.c profiles/tiobench.c \ + SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \ oslib/linux-dev-lookup.c LIBS += -ldl LDFLAGS += -rdynamic @@ -179,7 +183,6 @@ LIBS += -lpthread -ldl endif ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS))) - SOURCE := $(filter-out engines/mmap.c,$(SOURCE)) SOURCE += os/windows/posix.c LIBS += -lpthread -lpsapi -lws2_32 CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include 
-Wno-format -static @@ -209,7 +212,8 @@ T_ZIPF_OBS = t/genzipf.o T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/pattern.o lib/zipf.o \ - lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o + lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o \ + oslib/strndup.o T_ZIPF_PROGS = t/fio-genzipf T_AXMAP_OBJS = t/axmap.o @@ -222,7 +226,7 @@ T_GEN_RAND_OBJS = t/gen-rand.o T_GEN_RAND_OBJS += t/log.o t/debug.o lib/rand.o lib/pattern.o lib/strntol.o \ - oslib/strcasestr.o + oslib/strcasestr.o oslib/strndup.o T_GEN_RAND_PROGS = t/gen-rand ifeq ($(CONFIG_TARGET_OS), Linux) @@ -234,10 +238,10 @@ T_DEDUPE_OBJS = t/dedupe.o T_DEDUPE_OBJS += lib/rbtree.o t/log.o mutex.o smalloc.o gettime.o crc/md5.o \ lib/memalign.o lib/bloom.o t/debug.o crc/xxhash.o t/arch.o \ - crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o crc/fnv.o + crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o crc/fnv.o T_DEDUPE_PROGS = t/fio-dedupe -T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o t/debug.o +T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o t/debug.o T_VS_PROGS = t/fio-verify-state T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o @@ -246,6 +250,9 @@ T_MEMLOCK_OBJS = t/memlock.o T_MEMLOCK_PROGS = t/memlock +T_TT_OBJS = t/time-test.o +T_TT_PROGS = t/time-test + T_OBJS = $(T_SMALLOC_OBJS) T_OBJS += $(T_IEEE_OBJS) T_OBJS += $(T_ZIPF_OBJS) @@ -257,6 +264,7 @@ T_OBJS += $(T_VS_OBJS) T_OBJS += $(T_PIPE_ASYNC_OBJS) T_OBJS += $(T_MEMLOCK_OBJS) +T_OBJS += $(T_TT_OBJS) ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS))) T_DEDUPE_OBJS += os/windows/posix.o lib/hweight.o @@ -304,7 +312,7 @@ all: $(PROGS) $(T_TEST_PROGS) $(SCRIPTS) FORCE -.PHONY: all install clean +.PHONY: all install clean test .PHONY: FORCE cscope FIO-VERSION-FILE: FORCE @@ -319,8 +327,13 @@ @$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d @mv -f $*.d $*.d.tmp @sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d +ifeq ($(CONFIG_TARGET_OS), NetBSD) + @sed -e 's/.*://' -e 
's/\\$$//' < $*.d.tmp | tr -cs "[:graph:]" "\n" | \ + sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d +else @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $*.d +endif @rm -f $*.d.tmp ifdef CONFIG_ARITHMETIC @@ -358,8 +371,13 @@ @$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d @mv -f $*.d $*.d.tmp @sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d +ifeq ($(CONFIG_TARGET_OS), NetBSD) + @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | tr -cs "[:graph:]" "\n" | \ + sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d +else @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $*.d +endif @rm -f $*.d.tmp gcompat.o: gcompat.c gcompat.h @@ -430,8 +448,12 @@ t/fio-verify-state: $(T_VS_OBJS) $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_VS_OBJS) $(LIBS) +t/time-test: $(T_TT_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_TT_OBJS) $(LIBS) + clean: FORCE @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio FIO-VERSION-FILE *.d lib/*.d oslib/*.d crc/*.d engines/*.d profiles/*.d t/*.d config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h + @rm -rf doc/output distclean: clean FORCE @rm -f cscope.out fio.pdf fio_generate_plots.pdf fio2gnuplot.pdf fiologparser_hist.pdf @@ -448,7 +470,8 @@ @man -t tools/plot/fio2gnuplot.1 | ps2pdf - fio2gnuplot.pdf @man -t tools/hist/fiologparser_hist.py.1 | ps2pdf - fiologparser_hist.pdf -test: +test: fio + ./fio --minimal --thread --exitall_on_error --runtime=1s --name=nulltest --ioengine=null --rw=randrw --iodepth=2 --norandommap --random_generator=tausworthe64 --size=16T --name=verifyfstest --filename=fiotestfile.tmp --unlink=1 --rw=write --verify=crc32c --verify_state_save=0 --size=16K install: $(PROGS) $(SCRIPTS) tools/plot/fio2gnuplot.1 FORCE $(INSTALL) -m 755 -d $(DESTDIR)$(bindir) diff -Nru fio-2.16/memory.c fio-3.1/memory.c --- fio-2.16/memory.c 2016-12-20 06:12:56.000000000 +0000 
+++ fio-3.1/memory.c 2017-09-28 10:23:20.000000000 +0000 @@ -33,13 +33,13 @@ dprint(FD_MEM, "pinning %llu bytes\n", td->o.lockmem); /* - * Don't allow mlock of more than real_mem-128MB + * Don't allow mlock of more than real_mem-128MiB */ phys_mem = os_phys_mem(); if (phys_mem) { if ((td->o.lockmem + 128 * 1024 * 1024) > phys_mem) { td->o.lockmem = phys_mem - 128 * 1024 * 1024; - log_info("fio: limiting mlocked memory to %lluMB\n", + log_info("fio: limiting mlocked memory to %lluMiB\n", td->o.lockmem >> 20); } } @@ -138,6 +138,9 @@ } if (td->o.mmapfile) { + if (access(td->o.mmapfile, F_OK) == 0) + td->flags |= TD_F_MMAP_KEEP; + td->mmapfd = open(td->o.mmapfile, O_RDWR|O_CREAT, 0644); if (td->mmapfd < 0) { @@ -169,7 +172,7 @@ td->orig_buffer = NULL; if (td->mmapfd != 1 && td->mmapfd != -1) { close(td->mmapfd); - if (td->o.mmapfile) + if (td->o.mmapfile && !(td->flags & TD_F_MMAP_KEEP)) unlink(td->o.mmapfile); } @@ -187,7 +190,8 @@ if (td->o.mmapfile) { if (td->mmapfd != -1) close(td->mmapfd); - unlink(td->o.mmapfile); + if (!(td->flags & TD_F_MMAP_KEEP)) + unlink(td->o.mmapfile); free(td->o.mmapfile); } } @@ -207,6 +211,78 @@ free(td->orig_buffer); } +static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem) +{ +#ifdef CONFIG_CUDA + CUresult ret; + char name[128]; + + ret = cuInit(0); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed initialize cuda driver api\n"); + return 1; + } + + ret = cuDeviceGetCount(&td->gpu_dev_cnt); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get device count\n"); + return 1; + } + dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt); + + if (td->gpu_dev_cnt == 0) { + log_err("fio: no GPU device found. 
" + "Can not perform GPUDirect RDMA.\n"); + return 1; + } + + td->gpu_dev_id = td->o.gpu_dev_id; + ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get GPU device\n"); + return 1; + } + + ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get device name\n"); + return 1; + } + dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \ + td->gpu_dev_id, name); + + ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed to create cuda context: %d\n", ret); + return 1; + } + + ret = cuMemAlloc(&td->dev_mem_ptr, total_mem); + if (ret != CUDA_SUCCESS) { + log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem); + return 1; + } + td->orig_buffer = (void *) td->dev_mem_ptr; + + dprint(FD_MEM, "cudaMalloc %llu %p\n", \ + (unsigned long long) total_mem, td->orig_buffer); + return 0; +#else + return -EINVAL; +#endif +} + +static void free_mem_cudamalloc(struct thread_data *td) +{ +#ifdef CONFIG_CUDA + if (td->dev_mem_ptr != NULL) + cuMemFree(td->dev_mem_ptr); + + if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS) + log_err("fio: failed to destroy cuda context\n"); +#endif +} + /* * Set up the buffer area we need for io. 
*/ @@ -246,6 +322,8 @@ else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAPSHARED) ret = alloc_mem_mmap(td, total_mem); + else if (td->o.mem_type == MEM_CUDA_MALLOC) + ret = alloc_mem_cudamalloc(td, total_mem); else { log_err("fio: bad mem type: %d\n", td->o.mem_type); ret = 1; @@ -275,6 +353,8 @@ else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAPSHARED) free_mem_mmap(td, total_mem); + else if (td->o.mem_type == MEM_CUDA_MALLOC) + free_mem_cudamalloc(td); else log_err("Bad memory type %u\n", td->o.mem_type); diff -Nru fio-2.16/mutex.c fio-3.1/mutex.c --- fio-2.16/mutex.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/mutex.c 2017-09-28 10:23:20.000000000 +0000 @@ -47,7 +47,7 @@ return ret; } -#ifdef FIO_HAVE_PSHARED_MUTEX +#ifdef CONFIG_PSHARED ret = pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED); if (ret) { log_err("pthread_condattr_setpshared: %s\n", strerror(ret)); @@ -77,7 +77,7 @@ /* * Not all platforms support process shared mutexes (FreeBSD) */ -#ifdef FIO_HAVE_PSHARED_MUTEX +#ifdef CONFIG_PSHARED ret = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); if (ret) { log_err("pthread_mutexattr_setpshared: %s\n", strerror(ret)); @@ -141,11 +141,15 @@ return NULL; } -static bool mutex_timed_out(struct timeval *t, unsigned int msecs) +static bool mutex_timed_out(struct timespec *t, unsigned int msecs) { - struct timeval now; + struct timeval tv; + struct timespec now; + + gettimeofday(&tv, NULL); + now.tv_sec = tv.tv_sec; + now.tv_nsec = tv.tv_usec * 1000; - gettimeofday(&now, NULL); return mtime_since(t, &now) >= msecs; } @@ -177,7 +181,7 @@ * way too early, double check. 
*/ ret = pthread_cond_timedwait(&mutex->cond, &mutex->lock, &t); - if (ret == ETIMEDOUT && !mutex_timed_out(&tv_s, msecs)) + if (ret == ETIMEDOUT && !mutex_timed_out(&t, msecs)) ret = 0; } mutex->waiters--; @@ -287,7 +291,7 @@ log_err("pthread_rwlockattr_init: %s\n", strerror(ret)); goto err; } -#ifdef FIO_HAVE_PSHARED_MUTEX +#ifdef CONFIG_PSHARED ret = pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); if (ret) { log_err("pthread_rwlockattr_setpshared: %s\n", strerror(ret)); diff -Nru fio-2.16/optgroup.c fio-3.1/optgroup.c --- fio-2.16/optgroup.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/optgroup.c 2017-09-28 10:23:20.000000000 +0000 @@ -31,16 +31,16 @@ .mask = FIO_OPT_C_PROFILE, }, { + .name = "I/O engines", + .mask = FIO_OPT_C_ENGINE, + }, + { .name = NULL, }, }; static const struct opt_group fio_opt_cat_groups[] = { { - .name = "Latency profiling", - .mask = FIO_OPT_G_LATPROF, - }, - { .name = "Rate", .mask = FIO_OPT_G_RATE, }, @@ -125,13 +125,52 @@ .mask = FIO_OPT_G_TIOBENCH, }, { - .name = "MTD", + .name = "Error handling", + .mask = FIO_OPT_G_ERR, + }, + { + .name = "Ext4 defrag I/O engine", /* e4defrag */ + .mask = FIO_OPT_G_E4DEFRAG, + }, + { + .name = "Network I/O engine", /* net */ + .mask = FIO_OPT_G_NETIO, + }, + { + .name = "RDMA I/O engine", /* rdma */ + .mask = FIO_OPT_G_RDMA, + }, + { + .name = "libaio I/O engine", /* libaio */ + .mask = FIO_OPT_G_LIBAIO, + }, + { + .name = "ACT Aerospike like benchmark profile", + .mask = FIO_OPT_G_ACT, + }, + { + .name = "Latency profiling", + .mask = FIO_OPT_G_LATPROF, + }, + { + .name = "RBD I/O engine", /* rbd */ + .mask = FIO_OPT_G_RBD, + }, + { + .name = "GlusterFS I/O engine", /* gfapi,gfapi_async */ + .mask = FIO_OPT_G_GFAPI, + }, + { + .name = "MTD I/O engine", /* mtd */ .mask = FIO_OPT_G_MTD, }, - + { + .name = "libhdfs I/O engine", /* libhdfs */ + .mask = FIO_OPT_G_HDFS, + }, { .name = NULL, - } + }, }; static const struct opt_group *group_from_mask(const struct opt_group *ogs, diff 
-Nru fio-2.16/options.c fio-3.1/options.c --- fio-2.16/options.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/options.c 2017-09-28 10:23:20.000000000 +0000 @@ -270,7 +270,8 @@ return 0; } -static int ignore_error_type(struct thread_data *td, int etype, char *str) +static int ignore_error_type(struct thread_data *td, enum error_type_bit etype, + char *str) { unsigned int i; int *error; @@ -282,7 +283,7 @@ } td->o.ignore_error_nr[etype] = 4; - error = malloc(4 * sizeof(struct bssplit)); + error = calloc(4, sizeof(int)); i = 0; while ((fname = strsep(&str, ":")) != NULL) { @@ -306,8 +307,9 @@ error[i] = -error[i]; } if (!error[i]) { - log_err("Unknown error %s, please use number value \n", + log_err("Unknown error %s, please use number value\n", fname); + td->o.ignore_error_nr[etype] = 0; free(error); return 1; } @@ -317,8 +319,10 @@ td->o.continue_on_error |= 1 << etype; td->o.ignore_error_nr[etype] = i; td->o.ignore_error[etype] = error; - } else + } else { + td->o.ignore_error_nr[etype] = 0; free(error); + } return 0; @@ -328,7 +332,8 @@ { struct thread_data *td = cb_data_to_td(data); char *str, *p, *n; - int type = 0, ret = 1; + int ret = 1; + enum error_type_bit type = 0; if (parse_dryrun()) return 0; @@ -1233,6 +1238,9 @@ strip_blank_front(&str); strip_blank_end(str); + /* + * Ignore what we may already have from nrfiles option. + */ if (!td->files_index) td->o.nr_files = 0; @@ -1303,8 +1311,17 @@ assert(ret != 0); td->o.buffer_pattern_bytes = ret; - if (!td->o.compress_percentage) + + /* + * If this job is doing any reading or has compression set, + * ensure that we refill buffers for writes or we could be + * invalidating the pattern through reads. 
+ */ + if (!td->o.compress_percentage && !td_read(td)) td->o.refill_buffers = 0; + else + td->o.refill_buffers = 1; + td->o.scramble_buffers = 0; td->o.zero_buffers = 0; @@ -1364,7 +1381,23 @@ td->o.disable_bw = !!val; td->o.clat_percentiles = !val; if (val) - td->tv_cache_mask = 63; + td->ts_cache_mask = 63; + + return 0; +} + +static int str_offset_cb(void *data, unsigned long long *__val) +{ + struct thread_data *td = cb_data_to_td(data); + unsigned long long v = *__val; + + if (parse_is_percent(v)) { + td->o.start_offset = 0; + td->o.start_offset_percent = -1ULL - v; + dprint(FD_PARSE, "SET start_offset_percent %d\n", + td->o.start_offset_percent); + } else + td->o.start_offset = v; return 0; } @@ -1377,6 +1410,8 @@ if (parse_is_percent(v)) { td->o.size = 0; td->o.size_percent = -1ULL - v; + dprint(FD_PARSE, "SET size_percent %d\n", + td->o.size_percent); } else td->o.size = v; @@ -1427,6 +1462,39 @@ return 0; } +/* + * str is supposed to be a substring of the strdup'd original string, + * and is valid only if it's a regular file path. + * This function keeps the pointer to the path as needed later. + * + * "external:/path/to/so\0" <- original pointer updated with strdup'd + * "external\0" <- above pointer after parsed, i.e. ->ioengine + * "/path/to/so\0" <- str argument, i.e. 
->ioengine_so_path + */ +static int str_ioengine_external_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + struct stat sb; + char *p; + + if (!str) { + log_err("fio: null external ioengine path\n"); + return 1; + } + + p = (char *)str; /* str is mutable */ + strip_blank_front(&p); + strip_blank_end(p); + + if (stat(p, &sb) || !S_ISREG(sb.st_mode)) { + log_err("fio: invalid external ioengine path \"%s\"\n", p); + return 1; + } + + td->o.ioengine_so_path = p; + return 0; +} + static int rw_verify(struct fio_option *o, void *data) { struct thread_data *td = cb_data_to_td(data); @@ -1777,6 +1845,7 @@ #endif { .ival = "external", .help = "Load external engine (append name)", + .cb = str_ioengine_external_cb, }, }, }, @@ -1847,6 +1916,17 @@ .group = FIO_OPT_G_IO_BASIC, }, { + .name = "serialize_overlap", + .lname = "Serialize overlap", + .off1 = offsetof(struct thread_options, serialize_overlap), + .type = FIO_OPT_BOOL, + .help = "Wait for in-flight IOs that collide to complete", + .parent = "iodepth", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { .name = "io_submit_mode", .lname = "IO submit mode", .type = FIO_OPT_STR, @@ -1882,7 +1962,8 @@ .alias = "io_limit", .lname = "IO Size", .type = FIO_OPT_STR_VAL, - .off1 = offsetof(struct thread_options, io_limit), + .off1 = offsetof(struct thread_options, io_size), + .help = "Total size of I/O to be performed", .interval = 1024 * 1024, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_INVALID, @@ -1925,6 +2006,7 @@ .lname = "IO offset", .alias = "fileoffset", .type = FIO_OPT_STR_VAL, + .cb = str_offset_cb, .off1 = offsetof(struct thread_options, start_offset), .help = "Start IO from this offset", .def = "0", @@ -1965,7 +2047,7 @@ .off3 = offsetof(struct thread_options, bs[DDIR_TRIM]), .minval = 1, .help = "Block size unit", - .def = "4k", + .def = "4096", .parent = "rw", .hide = 1, .interval = 512, @@ -2232,9 +2314,13 @@ .oval = FIO_FSERVICE_PARETO, .help = 
"Pareto randomized", }, + { .ival = "normal", + .oval = FIO_FSERVICE_GAUSS, + .help = "Normal (Gaussian) randomized", + }, { .ival = "gauss", .oval = FIO_FSERVICE_GAUSS, - .help = "Normal (gaussian) distribution", + .help = "Alias for normal", }, { .ival = "roundrobin", .oval = FIO_FSERVICE_RR, @@ -2248,14 +2334,14 @@ .parent = "nrfiles", .hide = 1, }, -#ifdef CONFIG_POSIX_FALLOCATE +#ifdef FIO_HAVE_ANY_FALLOCATE { .name = "fallocate", .lname = "Fallocate", .type = FIO_OPT_STR, .off1 = offsetof(struct thread_options, fallocate_mode), .help = "Whether pre-allocation is performed when laying out files", - .def = "posix", + .def = "native", .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_INVALID, .posval = { @@ -2263,10 +2349,16 @@ .oval = FIO_FALLOCATE_NONE, .help = "Do not pre-allocate space", }, + { .ival = "native", + .oval = FIO_FALLOCATE_NATIVE, + .help = "Use native pre-allocation if possible", + }, +#ifdef CONFIG_POSIX_FALLOCATE { .ival = "posix", .oval = FIO_FALLOCATE_POSIX, .help = "Use posix_fallocate()", }, +#endif #ifdef CONFIG_LINUX_FALLOCATE { .ival = "keep", .oval = FIO_FALLOCATE_KEEP_SIZE, @@ -2278,20 +2370,22 @@ .oval = FIO_FALLOCATE_NONE, .help = "Alias for 'none'", }, +#ifdef CONFIG_POSIX_FALLOCATE { .ival = "1", .oval = FIO_FALLOCATE_POSIX, .help = "Alias for 'posix'", }, +#endif }, }, -#else /* CONFIG_POSIX_FALLOCATE */ +#else /* FIO_HAVE_ANY_FALLOCATE */ { .name = "fallocate", .lname = "Fallocate", .type = FIO_OPT_UNSUPPORTED, .help = "Your platform does not support fallocate", }, -#endif /* CONFIG_POSIX_FALLOCATE */ +#endif /* FIO_HAVE_ANY_FALLOCATE */ { .name = "fadvise_hint", .lname = "Fadvise hint", @@ -2320,24 +2414,6 @@ .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_INVALID, }, -#ifdef FIO_HAVE_STREAMID - { - .name = "fadvise_stream", - .lname = "Fadvise stream", - .type = FIO_OPT_INT, - .off1 = offsetof(struct thread_options, fadvise_stream), - .help = "Use fadvise() to set stream ID", - .category = FIO_OPT_C_FILE, - .group = 
FIO_OPT_G_INVALID, - }, -#else - { - .name = "fadvise_stream", - .lname = "Fadvise stream", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support fadvise stream ID", - }, -#endif { .name = "fsync", .lname = "Fsync", @@ -2600,6 +2676,12 @@ .help = "Like mmap, but use huge pages", }, #endif +#ifdef CONFIG_CUDA + { .ival = "cudamalloc", + .oval = MEM_CUDA_MALLOC, + .help = "Allocate GPU device memory for GPUDirect RDMA", + }, +#endif }, }, { @@ -2670,6 +2752,22 @@ .oval = VERIFY_SHA512, .help = "Use sha512 checksums for verification", }, + { .ival = "sha3-224", + .oval = VERIFY_SHA3_224, + .help = "Use sha3-224 checksums for verification", + }, + { .ival = "sha3-256", + .oval = VERIFY_SHA3_256, + .help = "Use sha3-256 checksums for verification", + }, + { .ival = "sha3-384", + .oval = VERIFY_SHA3_384, + .help = "Use sha3-384 checksums for verification", + }, + { .ival = "sha3-512", + .oval = VERIFY_SHA3_512, + .help = "Use sha3-512 checksums for verification", + }, { .ival = "xxhash", .oval = VERIFY_XXHASH, .help = "Use xxhash checksums for verification", @@ -2885,7 +2983,7 @@ .off1 = offsetof(struct thread_options, trim_percentage), .minval = 0, .maxval = 100, - .help = "Number of verify blocks to discard/trim", + .help = "Number of verify blocks to trim (i.e., discard)", .parent = "verify", .def = "0", .interval = 1, @@ -2897,7 +2995,7 @@ .name = "trim_verify_zero", .lname = "Verify trim zero", .type = FIO_OPT_BOOL, - .help = "Verify that trim/discarded blocks are returned as zeroes", + .help = "Verify that trimmed (i.e., discarded) blocks are returned as zeroes", .off1 = offsetof(struct thread_options, trim_zero), .parent = "trim_percentage", .hide = 1, @@ -3377,6 +3475,34 @@ .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_TYPE, }, +#ifdef FIO_HAVE_WRITE_HINT + { + .name = "write_hint", + .lname = "Write hint", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, write_hint), + .help = "Set expected write life time", + .category = 
FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "none", + .oval = RWH_WRITE_LIFE_NONE, + }, + { .ival = "short", + .oval = RWH_WRITE_LIFE_SHORT, + }, + { .ival = "medium", + .oval = RWH_WRITE_LIFE_MEDIUM, + }, + { .ival = "long", + .oval = RWH_WRITE_LIFE_LONG, + }, + { .ival = "extreme", + .oval = RWH_WRITE_LIFE_EXTREME, + }, + }, + }, +#endif { .name = "create_serialize", .lname = "Create serialize", @@ -3543,6 +3669,18 @@ .help = "Build fio with libnuma-dev(el) to enable this option", }, #endif +#ifdef CONFIG_CUDA + { + .name = "gpu_dev_id", + .lname = "GPU device ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, gpu_dev_id), + .help = "Set GPU device ID for GPUDirect RDMA", + .def = "0", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, +#endif { .name = "end_fsync", .lname = "End fsync", @@ -3846,6 +3984,16 @@ .group = FIO_OPT_G_INVALID, }, { + .name = "stats", + .lname = "Stats", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, stats), + .help = "Enable collection of stats", + .def = "1", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { .name = "zero_buffers", .lname = "Zero I/O buffers", .type = FIO_OPT_STR_SET, @@ -3928,6 +4076,18 @@ .off1 = offsetof(struct thread_options, clat_percentiles), .help = "Enable the reporting of completion latency percentiles", .def = "1", + .inverse = "lat_percentiles", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "lat_percentiles", + .lname = "IO latency percentiles", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, lat_percentiles), + .help = "Enable the reporting of IO latency percentiles", + .def = "0", + .inverse = "clat_percentiles", .category = FIO_OPT_C_STAT, .group = FIO_OPT_G_INVALID, }, @@ -4180,20 +4340,20 @@ .posval = { { .ival = "1024", .oval = 1024, - .help = "Use 1024 as the K base", + .help = "Inputs invert IEC and SI prefixes (for compatibility); outputs prefer 
binary", }, { .ival = "1000", .oval = 1000, - .help = "Use 1000 as the K base", + .help = "Inputs use IEC and SI prefixes; outputs prefer SI", }, }, - .help = "How many bytes per KB for reporting (1000 or 1024)", + .help = "Unit prefix interpretation for quantities of data (IEC and SI)", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_INVALID, }, { .name = "unit_base", - .lname = "Base unit for reporting (Bits or Bytes)", + .lname = "Unit for quantities of data (Bits or Bytes)", .type = FIO_OPT_INT, .off1 = offsetof(struct thread_options, unit_base), .prio = 1, @@ -4276,17 +4436,6 @@ .group = FIO_OPT_G_IO_FLOW, }, { - .name = "skip_bad", - .lname = "Skip operations against bad blocks", - .type = FIO_OPT_BOOL, - .off1 = offsetof(struct thread_options, skip_bad), - .help = "Skip operations against known bad blocks.", - .hide = 1, - .def = "0", - .category = FIO_OPT_C_IO, - .group = FIO_OPT_G_MTD, - }, - { .name = "steadystate", .lname = "Steady state threshold", .alias = "ss", @@ -4321,6 +4470,7 @@ .name = "steadystate_duration", .lname = "Steady state duration", .alias = "ss_dur", + .parent = "steadystate", .type = FIO_OPT_STR_VAL_TIME, .off1 = offsetof(struct thread_options, ss_dur), .help = "Stop workload upon attaining steady state for specified duration", @@ -4334,6 +4484,7 @@ .name = "steadystate_ramp_time", .lname = "Steady state ramp time", .alias = "ss_ramp", + .parent = "steadystate", .type = FIO_OPT_STR_VAL_TIME, .off1 = offsetof(struct thread_options, ss_ramp_time), .help = "Delay before initiation of data collection for steady state job termination testing", @@ -4769,34 +4920,19 @@ return show_cmd_help(fio_options, opt); } -void options_mem_dupe(void *data, struct fio_option *options) -{ - struct fio_option *o; - char **ptr; - - for (o = &options[0]; o->name; o++) { - if (o->type != FIO_OPT_STR_STORE) - continue; - - ptr = td_var(data, o, o->off1); - if (*ptr) - *ptr = strdup(*ptr); - } -} - /* * dupe FIO_OPT_STR_STORE options */ void 
fio_options_mem_dupe(struct thread_data *td) { - options_mem_dupe(&td->o, fio_options); + options_mem_dupe(fio_options, &td->o); if (td->eo && td->io_ops) { void *oldeo = td->eo; td->eo = malloc(td->io_ops->option_struct_size); memcpy(td->eo, oldeo, td->io_ops->option_struct_size); - options_mem_dupe(td->eo, td->io_ops->options); + options_mem_dupe(td->io_ops->options, td->eo); } } diff -Nru fio-2.16/os/os-aix.h fio-3.1/os/os-aix.h --- fio-2.16/os/os-aix.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-aix.h 2017-09-28 10:23:20.000000000 +0000 @@ -14,8 +14,6 @@ #define FIO_USE_GENERIC_RAND #define FIO_USE_GENERIC_INIT_RANDOM_STATE -#define FIO_HAVE_PSHARED_MUTEX - #define OS_MAP_ANON MAP_ANON #define OS_MSG_DONTWAIT 0 @@ -23,7 +21,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) diff -Nru fio-2.16/os/os-android.h fio-3.1/os/os-android.h --- fio-2.16/os/os-android.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-android.h 2017-09-28 10:23:20.000000000 +0000 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,10 @@ #include "binject.h" #include "../file.h" +#ifndef __has_builtin // Optional of course. + #define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif + #define FIO_HAVE_DISK_UTIL #define FIO_HAVE_IOSCHED_SWITCH #define FIO_HAVE_IOPRIO @@ -27,8 +32,8 @@ #define FIO_HAVE_ODIRECT #define FIO_HAVE_HUGETLB #define FIO_HAVE_BLKTRACE -#define FIO_HAVE_PSHARED_MUTEX #define FIO_HAVE_CL_SIZE +#define FIO_HAVE_CGROUPS #define FIO_HAVE_FS_STAT #define FIO_HAVE_TRIM #define FIO_HAVE_GETTID @@ -54,22 +59,19 @@ #define MAP_HUGETLB 0x40000 /* arch specific */ #endif - +#ifndef CONFIG_NO_SHM /* - * The Android NDK doesn't currently export , so define the - * necessary stuff here. 
+ * Bionic doesn't support SysV shared memeory, so implement it using ashmem */ - -#include -#define SHM_HUGETLB 04000 - #include #include -#include +#include +#define shmid_ds shmid64_ds +#define SHM_HUGETLB 04000 #define ASHMEM_DEVICE "/dev/ashmem" -static inline int shmctl (int __shmid, int __cmd, struct shmid_ds *__buf) +static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) { int ret=0; if (__cmd == IPC_RMID) @@ -82,47 +84,50 @@ return ret; } -static inline int shmget (key_t __key, size_t __size, int __shmflg) +static inline int shmget(key_t __key, size_t __size, int __shmflg) { int fd,ret; - char key[11]; - + char keybuf[11]; + fd = open(ASHMEM_DEVICE, O_RDWR); if (fd < 0) return fd; - sprintf(key,"%d",__key); - ret = ioctl(fd, ASHMEM_SET_NAME, key); + sprintf(keybuf,"%d",__key); + ret = ioctl(fd, ASHMEM_SET_NAME, keybuf); if (ret < 0) goto error; - ret = ioctl(fd, ASHMEM_SET_SIZE, __size); + /* Stores size in first 8 bytes, allocate extra space */ + ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t)); if (ret < 0) goto error; return fd; - + error: close(fd); return ret; } -static inline void *shmat (int __shmid, const void *__shmaddr, int __shmflg) +static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg) { - size_t *ptr, size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); - ptr = mmap(NULL, size + sizeof(size_t), PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0); - *ptr = size; //save size at beginning of buffer, for use with munmap - return &ptr[1]; + size_t size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); + /* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */ + uint64_t *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0); + /* Save size at beginning of buffer, for use with munmap */ + *ptr = size; + return ptr + 1; } static inline int shmdt (const void *__shmaddr) { - size_t *ptr, size; - ptr = (size_t *)__shmaddr; - ptr--; - size = *ptr; //find mmap size which we stored at the beginning of 
the buffer - return munmap((void *)ptr, size + sizeof(size_t)); + /* Find mmap size which we stored at the beginning of the buffer */ + uint64_t *ptr = (uint64_t *)__shmaddr - 1; + size_t size = *ptr; + return munmap(ptr, size); } +#endif #define SPLICE_DEF_SIZE (64*1024) @@ -220,9 +225,19 @@ #define FIO_O_NOATIME 0 #endif -#define fio_swap16(x) __bswap_16(x) -#define fio_swap32(x) __bswap_32(x) -#define fio_swap64(x) __bswap_64(x) +/* Check for GCC or Clang byte swap intrinsics */ +#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \ + && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \ + || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */ +#define fio_swap16(x) __builtin_bswap16(x) +#define fio_swap32(x) __builtin_bswap32(x) +#define fio_swap64(x) __builtin_bswap64(x) +#else +#include +#define fio_swap16(x) bswap_16(x) +#define fio_swap32(x) bswap_32(x) +#define fio_swap64(x) bswap_64(x) +#endif /* fio_swapN */ #define CACHE_LINE_FILE \ "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" @@ -259,7 +274,7 @@ return ret; } -static inline int os_trim(int fd, unsigned long long start, +static inline int os_trim(struct fio_file *f, unsigned long long start, unsigned long long len) { uint64_t range[2]; @@ -267,7 +282,7 @@ range[0] = start; range[1] = len; - if (!ioctl(fd, BLKDISCARD, range)) + if (!ioctl(f->fd, BLKDISCARD, range)) return 0; return errno; diff -Nru fio-2.16/os/os-dragonfly.h fio-3.1/os/os-dragonfly.h --- fio-2.16/os/os-dragonfly.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-dragonfly.h 2017-09-28 10:23:20.000000000 +0000 @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,7 @@ #define FIO_HAVE_GETTID #define FIO_HAVE_CPU_AFFINITY #define FIO_HAVE_IOPRIO +#define FIO_HAVE_SHM_ATTACH_REMOVED #define OS_MAP_ANON MAP_ANON @@ -183,7 +185,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline unsigned long 
long os_phys_mem(void) @@ -214,7 +216,7 @@ return ret; } -static inline int os_trim(int fd, unsigned long long start, +static inline int os_trim(struct fio_file *f, unsigned long long start, unsigned long long len) { off_t range[2]; @@ -222,7 +224,7 @@ range[0] = start; range[1] = len; - if (!ioctl(fd, IOCTLTRIM, range)) + if (!ioctl(f->fd, IOCTLTRIM, range)) return 0; return errno; @@ -232,4 +234,15 @@ #define FIO_MADV_FREE MADV_FREE #endif +static inline int shm_attach_to_open_removed(void) +{ + int x; + size_t len = sizeof(x); + + if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0) + return 0; + + return x > 0 ? 1 : 0; +} + #endif diff -Nru fio-2.16/os/os-freebsd.h fio-3.1/os/os-freebsd.h --- fio-2.16/os/os-freebsd.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-freebsd.h 2017-09-28 10:23:20.000000000 +0000 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,7 @@ #define FIO_HAVE_TRIM #define FIO_HAVE_GETTID #define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_SHM_ATTACH_REMOVED #define OS_MAP_ANON MAP_ANON @@ -81,7 +83,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -115,7 +117,7 @@ return ret; } -static inline int os_trim(int fd, unsigned long long start, +static inline int os_trim(struct fio_file *f, unsigned long long start, unsigned long long len) { off_t range[2]; @@ -123,7 +125,7 @@ range[0] = start; range[1] = len; - if (!ioctl(fd, DIOCGDELETE, range)) + if (!ioctl(f->fd, DIOCGDELETE, range)) return 0; return errno; @@ -133,4 +135,15 @@ #define FIO_MADV_FREE MADV_FREE #endif +static inline int shm_attach_to_open_removed(void) +{ + int x; + size_t len = sizeof(x); + + if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0) + return 0; + + return x > 0 ? 
1 : 0; +} + #endif diff -Nru fio-2.16/os/os.h fio-3.1/os/os.h --- fio-2.16/os/os.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os.h 2017-09-28 10:23:20.000000000 +0000 @@ -60,11 +60,6 @@ #endif #endif -#ifdef FIO_HAVE_SGIO -#include -#include -#endif - #ifndef CONFIG_STRSEP #include "../oslib/strsep.h" #endif @@ -81,6 +76,7 @@ #define POSIX_FADV_DONTNEED (0) #define POSIX_FADV_SEQUENTIAL (0) #define POSIX_FADV_RANDOM (0) +#define POSIX_FADV_NORMAL (0) #endif #ifndef FIO_HAVE_CPU_AFFINITY @@ -208,16 +204,20 @@ #ifndef FIO_HAVE_BYTEORDER_FUNCS #ifdef CONFIG_LITTLE_ENDIAN +#define __be64_to_cpu(x) fio_swap64(x) #define __le16_to_cpu(x) (x) #define __le32_to_cpu(x) (x) #define __le64_to_cpu(x) (x) +#define __cpu_to_be64(x) fio_swap64(x) #define __cpu_to_le16(x) (x) #define __cpu_to_le32(x) (x) #define __cpu_to_le64(x) (x) #else +#define __be64_to_cpu(x) (x) #define __le16_to_cpu(x) fio_swap16(x) #define __le32_to_cpu(x) fio_swap32(x) #define __le64_to_cpu(x) fio_swap64(x) +#define __cpu_to_be64(x) (x) #define __cpu_to_le16(x) fio_swap16(x) #define __cpu_to_le32(x) fio_swap32(x) #define __cpu_to_le64(x) fio_swap64(x) @@ -225,6 +225,10 @@ #endif /* FIO_HAVE_BYTEORDER_FUNCS */ #ifdef FIO_INTERNAL +#define be64_to_cpu(val) ({ \ + typecheck(uint64_t, val); \ + __be64_to_cpu(val); \ +}) #define le16_to_cpu(val) ({ \ typecheck(uint16_t, val); \ __le16_to_cpu(val); \ @@ -239,6 +243,10 @@ }) #endif +#define cpu_to_be64(val) ({ \ + typecheck(uint64_t, val); \ + __cpu_to_be64(val); \ +}) #define cpu_to_le16(val) ({ \ typecheck(uint16_t, val); \ __cpu_to_le16(val); \ @@ -252,19 +260,6 @@ __cpu_to_le64(val); \ }) -#ifndef FIO_HAVE_BLKTRACE -static inline int is_blktrace(const char *fname, int *need_swap) -{ - return 0; -} -struct thread_data; -static inline int load_blktrace(struct thread_data *td, const char *fname, - int need_swap) -{ - return 1; -} -#endif - #define FIO_DEF_CL_SIZE 128 static inline int os_cache_line_size(void) @@ -315,12 +310,7 @@ #endif #ifdef 
FIO_USE_GENERIC_INIT_RANDOM_STATE -extern void td_fill_rand_seeds(struct thread_data *td); -/* - * Initialize the various random states we need (random io, block size ranges, - * read/write mix, etc). - */ -static inline int init_random_state(struct thread_data *td, unsigned long *rand_seeds, int size) +static inline int init_random_seeds(unsigned long *rand_seeds, int size) { int fd; @@ -335,7 +325,6 @@ } close(fd); - td_fill_rand_seeds(td); return 0; } #endif @@ -347,14 +336,6 @@ } #endif -#ifdef __powerpc64__ -#define FIO_HAVE_CPU_ONLINE_SYSCONF -static inline unsigned int cpus_online(void) -{ - return sysconf(_SC_NPROCESSORS_CONF); -} -#endif - #ifndef FIO_HAVE_CPU_ONLINE_SYSCONF static inline unsigned int cpus_online(void) { @@ -385,4 +366,23 @@ } #endif +#ifndef FIO_HAVE_SHM_ATTACH_REMOVED +static inline int shm_attach_to_open_removed(void) +{ + return 0; +} +#endif + +#ifndef FIO_HAVE_NATIVE_FALLOCATE +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len) +{ + errno = ENOSYS; + return false; +} +#endif + +#if defined(CONFIG_POSIX_FALLOCATE) || defined(FIO_HAVE_NATIVE_FALLOCATE) +# define FIO_HAVE_ANY_FALLOCATE +#endif + #endif diff -Nru fio-2.16/os/os-hpux.h fio-3.1/os/os-hpux.h --- fio-2.16/os/os-hpux.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-hpux.h 2017-09-28 10:23:20.000000000 +0000 @@ -22,7 +22,6 @@ #define FIO_HAVE_ODIRECT #define FIO_USE_GENERIC_RAND #define FIO_USE_GENERIC_INIT_RANDOM_STATE -#define FIO_HAVE_PSHARED_MUTEX #define FIO_HAVE_CHARDEV_SIZE #define OS_MAP_ANON MAP_ANONYMOUS @@ -44,7 +43,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) diff -Nru fio-2.16/os/os-linux.h fio-3.1/os/os-linux.h --- fio-2.16/os/os-linux.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-linux.h 2017-09-28 10:23:20.000000000 +0000 @@ -16,12 +16,17 @@ #include #include #include 
-#include +#include +#include #include "./os-linux-syscall.h" #include "binject.h" #include "../file.h" +#ifndef __has_builtin // Optional of course. + #define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif + #define FIO_HAVE_CPU_AFFINITY #define FIO_HAVE_DISK_UTIL #define FIO_HAVE_SGIO @@ -32,7 +37,6 @@ #define FIO_HAVE_HUGETLB #define FIO_HAVE_RAWBIND #define FIO_HAVE_BLKTRACE -#define FIO_HAVE_PSHARED_MUTEX #define FIO_HAVE_CL_SIZE #define FIO_HAVE_CGROUPS #define FIO_HAVE_FS_STAT @@ -41,6 +45,7 @@ #define FIO_HAVE_GETTID #define FIO_USE_GENERIC_INIT_RANDOM_STATE #define FIO_HAVE_PWRITEV2 +#define FIO_HAVE_SHM_ATTACH_REMOVED #ifdef MAP_HUGETLB #define FIO_HAVE_MMAP_HUGE @@ -219,21 +224,19 @@ #define FIO_MADV_FREE MADV_REMOVE #endif -#if defined(__builtin_bswap16) +/* Check for GCC or Clang byte swap intrinsics */ +#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \ + && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \ + || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */ #define fio_swap16(x) __builtin_bswap16(x) -#else -#define fio_swap16(x) __bswap_16(x) -#endif -#if defined(__builtin_bswap32) #define fio_swap32(x) __builtin_bswap32(x) -#else -#define fio_swap32(x) __bswap_32(x) -#endif -#if defined(__builtin_bswap64) #define fio_swap64(x) __builtin_bswap64(x) #else -#define fio_swap64(x) __bswap_64(x) -#endif +#include +#define fio_swap16(x) bswap_16(x) +#define fio_swap32(x) bswap_32(x) +#define fio_swap64(x) bswap_64(x) +#endif /* fio_swapN */ #define CACHE_LINE_FILE \ "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" @@ -257,6 +260,14 @@ return atoi(size); } +#ifdef __powerpc64__ +#define FIO_HAVE_CPU_ONLINE_SYSCONF +static inline unsigned int cpus_online(void) +{ + return sysconf(_SC_NPROCESSORS_CONF); +} +#endif + static inline unsigned long long get_fs_free_size(const char *path) { unsigned long long ret; @@ -270,7 +281,7 @@ return ret; } -static inline int os_trim(int fd, unsigned 
long long start, +static inline int os_trim(struct fio_file *f, unsigned long long start, unsigned long long len) { uint64_t range[2]; @@ -278,7 +289,7 @@ range[0] = start; range[1] = len; - if (!ioctl(fd, BLKDISCARD, range)) + if (!ioctl(f->fd, BLKDISCARD, range)) return 0; return errno; @@ -292,11 +303,26 @@ } #endif -#ifndef POSIX_FADV_STREAMID -#define POSIX_FADV_STREAMID 8 +#ifndef F_GET_RW_HINT +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) +#endif + +#ifndef RWH_WRITE_LIFE_NONE +#define RWH_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 #endif -#define FIO_HAVE_STREAMID +#define FIO_HAVE_WRITE_HINT #ifndef RWF_HIPRI #define RWF_HIPRI 0x00000001 @@ -308,14 +334,26 @@ #define RWF_SYNC 0x00000004 #endif +#ifndef RWF_WRITE_LIFE_SHIFT +#define RWF_WRITE_LIFE_SHIFT 4 +#define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_MEDIUM (2 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_LONG (3 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_EXTREME (4 << RWF_WRITE_LIFE_SHIFT) +#endif + #ifndef CONFIG_PWRITEV2 #ifdef __NR_preadv2 static inline void make_pos_h_l(unsigned long *pos_h, unsigned long *pos_l, off_t offset) { +#if BITS_PER_LONG == 64 + *pos_l = offset; + *pos_h = 0; +#else *pos_l = offset & 0xffffffff; *pos_h = ((uint64_t) offset) >> 32; - +#endif } static inline ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, unsigned int flags) @@ -349,4 +387,27 @@ #endif /* __NR_preadv2 */ #endif /* CONFIG_PWRITEV2 */ +static inline int shm_attach_to_open_removed(void) +{ + return 1; +} + +#ifdef CONFIG_LINUX_FALLOCATE +#define 
FIO_HAVE_NATIVE_FALLOCATE +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, + uint64_t len) +{ + int ret; + ret = fallocate(f->fd, 0, 0, len); + if (ret == 0) + return true; + + /* Work around buggy old glibc versions... */ + if (ret > 0) + errno = ret; + + return false; +} +#endif + #endif diff -Nru fio-2.16/os/os-mac.h fio-3.1/os/os-mac.h --- fio-2.16/os/os-mac.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-mac.h 2017-09-28 10:23:20.000000000 +0000 @@ -20,6 +20,7 @@ #define FIO_USE_GENERIC_INIT_RANDOM_STATE #define FIO_HAVE_GETTID #define FIO_HAVE_CHARDEV_SIZE +#define FIO_HAVE_NATIVE_FALLOCATE #define OS_MAP_ANON MAP_ANON @@ -40,9 +41,9 @@ #endif #define FIO_OS_DIRECTIO -static inline int fio_set_odirect(int fd) +static inline int fio_set_odirect(struct fio_file *f) { - if (fcntl(fd, F_NOCACHE, 1) == -1) + if (fcntl(f->fd, F_NOCACHE, 1) == -1) return errno; return 0; } @@ -77,7 +78,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -101,4 +102,15 @@ */ extern int fdatasync(int fd); +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len) +{ + fstore_t store = {F_ALLOCATEALL, F_PEOFPOSMODE, offset, len}; + if (fcntl(f->fd, F_PREALLOCATE, &store) != -1) { + if (ftruncate(f->fd, len) == 0) + return true; + } + + return false; +} + #endif diff -Nru fio-2.16/os/os-netbsd.h fio-3.1/os/os-netbsd.h --- fio-2.16/os/os-netbsd.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-netbsd.h 2017-09-28 10:23:20.000000000 +0000 @@ -10,9 +10,10 @@ #include #include #include -/* XXX hack to avoid confilcts between rbtree.h and */ -#define rb_node _rb_node +#include #include + +/* XXX hack to avoid confilcts between rbtree.h and */ #undef rb_node #undef rb_left #undef rb_right @@ -25,8 +26,6 @@ #define FIO_HAVE_FS_STAT #define FIO_HAVE_GETTID -#undef FIO_HAVE_CPU_AFFINITY /* XXX notyet */ - #define OS_MAP_ANON 
MAP_ANON #ifndef PTHREAD_STACK_MIN @@ -54,7 +53,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) diff -Nru fio-2.16/os/os-openbsd.h fio-3.1/os/os-openbsd.h --- fio-2.16/os/os-openbsd.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-openbsd.h 2017-09-28 10:23:20.000000000 +0000 @@ -9,21 +9,22 @@ #include #include #include -/* XXX hack to avoid conflicts between rbtree.h and */ +#include +#include #include + +/* XXX hack to avoid conflicts between rbtree.h and */ #undef RB_BLACK #undef RB_RED #undef RB_ROOT #include "../file.h" -#undef FIO_HAVE_ODIRECT #define FIO_USE_GENERIC_RAND #define FIO_USE_GENERIC_INIT_RANDOM_STATE #define FIO_HAVE_FS_STAT #define FIO_HAVE_GETTID - -#undef FIO_HAVE_CPU_AFFINITY /* XXX notyet */ +#define FIO_HAVE_SHM_ATTACH_REMOVED #define OS_MAP_ANON MAP_ANON @@ -52,7 +53,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -67,7 +68,7 @@ static inline int gettid(void) { - return (int) pthread_self(); + return (int)(intptr_t) pthread_self(); } static inline unsigned long long get_fs_free_size(const char *path) @@ -87,4 +88,34 @@ #define FIO_MADV_FREE MADV_FREE #endif +static inline int shm_attach_to_open_removed(void) +{ + struct utsname uts; + int major, minor; + + if (uname(&uts) == -1) + return 0; + + /* + * Return 1 if >= OpenBSD 5.1 according to 97900ebf, + * assuming both major/minor versions are < 10. 
+ */ + if (uts.release[0] > '9' || uts.release[0] < '0') + return 0; + if (uts.release[1] != '.') + return 0; + if (uts.release[2] > '9' || uts.release[2] < '0') + return 0; + + major = uts.release[0] - '0'; + minor = uts.release[2] - '0'; + + if (major > 5) + return 1; + if (major == 5 && minor >= 1) + return 1; + + return 0; +} + #endif diff -Nru fio-2.16/os/os-solaris.h fio-3.1/os/os-solaris.h --- fio-2.16/os/os-solaris.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-solaris.h 2017-09-28 10:23:20.000000000 +0000 @@ -16,7 +16,6 @@ #include "../file.h" #define FIO_HAVE_CPU_AFFINITY -#define FIO_HAVE_PSHARED_MUTEX #define FIO_HAVE_CHARDEV_SIZE #define FIO_USE_GENERIC_BDEV_SIZE #define FIO_USE_GENERIC_INIT_RANDOM_STATE @@ -61,7 +60,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return 0; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -86,9 +85,9 @@ #define FIO_OS_DIRECTIO extern int directio(int, int); -static inline int fio_set_odirect(int fd) +static inline int fio_set_odirect(struct fio_file *f) { - if (directio(fd, DIRECTIO_ON) < 0) + if (directio(f->fd, DIRECTIO_ON) < 0) return errno; return 0; @@ -98,7 +97,7 @@ * pset binding hooks for fio */ #define fio_setaffinity(pid, cpumask) \ - pset_bind((cpumask), P_PID, (pid), NULL) + pset_bind((cpumask), P_LWPID, (pid), NULL) #define fio_getaffinity(pid, ptr) ({ 0; }) #define fio_cpu_clear(mask, cpu) pset_assign(PS_NONE, (cpu), NULL) diff -Nru fio-2.16/os/os-windows.h fio-3.1/os/os-windows.h --- fio-2.16/os/os-windows.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/os-windows.h 2017-09-28 10:23:20.000000000 +0000 @@ -116,7 +116,6 @@ ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset); ssize_t pwrite(int fildes, const void *buf, size_t nbyte, off_t offset); -extern void td_fill_rand_seeds(struct thread_data *); static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) { @@ -152,9 +151,7 @@ static inline int 
blockdev_invalidate_cache(struct fio_file *f) { - /* There's no way to invalidate the cache in Windows - * so just pretend to succeed */ - return 0; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -241,7 +238,7 @@ return 0; } -static inline int init_random_state(struct thread_data *td, unsigned long *rand_seeds, int size) +static inline int init_random_seeds(unsigned long *rand_seeds, int size) { HCRYPTPROV hCryptProv; @@ -260,7 +257,6 @@ } CryptReleaseContext(hCryptProv, 0); - td_fill_rand_seeds(td); return 0; } Binary files /tmp/tmptaLXeb/IdYq6qTbHH/fio-2.16/os/windows/eula.rtf and /tmp/tmptaLXeb/svYbzIcWYm/fio-3.1/os/windows/eula.rtf differ diff -Nru fio-2.16/os/windows/examples.wxs fio-3.1/os/windows/examples.wxs --- fio-2.16/os/windows/examples.wxs 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/windows/examples.wxs 2017-09-28 10:23:20.000000000 +0000 @@ -9,48 +9,111 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + @@ -59,20 +122,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - + diff -Nru fio-2.16/os/windows/install.wxs fio-3.1/os/windows/install.wxs --- fio-2.16/os/windows/install.wxs 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/windows/install.wxs 2017-09-28 10:23:20.000000000 +0000 @@ -10,7 +10,7 @@ + UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="3.1"> - + fio@vger.kernel.org http://www.spinics.net/lists/fio/ http://bluestop.org/fio/ diff -Nru fio-2.16/os/windows/posix.c fio-3.1/os/windows/posix.c --- fio-2.16/os/windows/posix.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/os/windows/posix.c 2017-09-28 10:23:20.000000000 +0000 @@ -25,8 +25,8 @@ #include "../os-windows.h" #include "../../lib/hweight.h" -extern unsigned long mtime_since_now(struct timeval *); -extern void fio_gettime(struct timeval *, void *); +extern unsigned long mtime_since_now(struct timespec *); +extern 
void fio_gettime(struct timespec *, void *); /* These aren't defined in the MinGW headers */ HRESULT WINAPI StringCchCopyA( @@ -40,12 +40,6 @@ const char *pszFormat, ...); -int vsprintf_s( - char *buffer, - size_t numberOfElements, - const char *format, - va_list argptr); - int win_to_posix_error(DWORD winerr) { switch (winerr) @@ -304,35 +298,76 @@ int fildes, off_t off) { DWORD vaProt = 0; + DWORD mapAccess = 0; + DWORD lenlow; + DWORD lenhigh; + HANDLE hMap; void* allocAddr = NULL; if (prot & PROT_NONE) vaProt |= PAGE_NOACCESS; - if ((prot & PROT_READ) && !(prot & PROT_WRITE)) + if ((prot & PROT_READ) && !(prot & PROT_WRITE)) { vaProt |= PAGE_READONLY; + mapAccess = FILE_MAP_READ; + } - if (prot & PROT_WRITE) + if (prot & PROT_WRITE) { vaProt |= PAGE_READWRITE; + mapAccess |= FILE_MAP_WRITE; + } + + lenlow = len & 0xFFFF; + lenhigh = len >> 16; + /* If the low DWORD is zero and the high DWORD is non-zero, `CreateFileMapping` + will return ERROR_INVALID_PARAMETER. To avoid this, set both to zero. */ + if (lenlow == 0) { + lenhigh = 0; + } - if ((flags & MAP_ANON) | (flags & MAP_ANONYMOUS)) + if (flags & MAP_ANON || flags & MAP_ANONYMOUS) { allocAddr = VirtualAlloc(addr, len, MEM_COMMIT, vaProt); if (allocAddr == NULL) errno = win_to_posix_error(GetLastError()); } + else + { + hMap = CreateFileMapping((HANDLE)_get_osfhandle(fildes), NULL, vaProt, lenhigh, lenlow, NULL); + + if (hMap != NULL) + { + allocAddr = MapViewOfFile(hMap, mapAccess, off >> 16, off & 0xFFFF, len); + } + + if (hMap == NULL || allocAddr == NULL) + errno = win_to_posix_error(GetLastError()); + + } return allocAddr; } int munmap(void *addr, size_t len) { - if (!VirtualFree(addr, 0, MEM_RELEASE)) { - errno = win_to_posix_error(GetLastError()); - return -1; + BOOL success; + + /* We may have allocated the memory with either MapViewOfFile or + VirtualAlloc. Therefore, try calling UnmapViewOfFile first, and if that + fails, call VirtualFree. 
*/ + success = UnmapViewOfFile(addr); + + if (!success) + { + success = VirtualFree(addr, 0, MEM_RELEASE); } - return 0; + return !success; +} + +int msync(void *addr, size_t len, int flags) +{ + return !FlushViewOfFile(addr, len); } int fork(void) @@ -549,7 +584,8 @@ while (path[i] != '\\' && path[i] != '/' && i >= 0) i--; - strncpy(name, path + i + 1, MAX_PATH); + name[MAX_PATH - 1] = '\0'; + strncpy(name, path + i + 1, MAX_PATH - 1); return name; } @@ -702,17 +738,9 @@ int posix_madvise(void *addr, size_t len, int advice) { - log_err("%s is not implemented\n", __func__); return ENOSYS; } -/* Windows doesn't support advice for memory pages. Just ignore it. */ -int msync(void *addr, size_t len, int flags) -{ - errno = ENOSYS; - return -1; -} - int fdatasync(int fildes) { return fsync(fildes); @@ -825,7 +853,7 @@ int nanosleep(const struct timespec *rqtp, struct timespec *rmtp) { - struct timeval tv; + struct timespec tv; DWORD ms_remaining; DWORD ms_total = (rqtp->tv_sec * 1000) + (rqtp->tv_nsec / 1000000.0); diff -Nru fio-2.16/oslib/libmtd_common.h fio-3.1/oslib/libmtd_common.h --- fio-2.16/oslib/libmtd_common.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/oslib/libmtd_common.h 2017-09-28 10:23:20.000000000 +0000 @@ -119,57 +119,6 @@ fprintf(stderr, "%s: warning!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \ } while(0) -#if defined(__UCLIBC__) -/* uClibc versions before 0.9.34 don't have rpmatch() */ -#if __UCLIBC_MAJOR__ == 0 && \ - (__UCLIBC_MINOR__ < 9 || \ - (__UCLIBC_MINOR__ == 9 && __UCLIBC_SUBLEVEL__ < 34)) -#undef rpmatch -#define rpmatch __rpmatch -static inline int __rpmatch(const char *resp) -{ - return (resp[0] == 'y' || resp[0] == 'Y') ? 1 : - (resp[0] == 'n' || resp[0] == 'N') ? 0 : -1; -} -#endif -#endif - -/** - * prompt the user for confirmation - */ -static inline bool prompt(const char *msg, bool def) -{ - char *line = NULL; - size_t len; - bool ret = def; - - do { - normsg_cont("%s (%c/%c) ", msg, def ? 'Y' : 'y', def ? 
'n' : 'N'); - fflush(stdout); - - while (getline(&line, &len, stdin) == -1) { - printf("failed to read prompt; assuming '%s'\n", - def ? "yes" : "no"); - break; - } - - if (strcmp("\n", line) != 0) { - switch (rpmatch(line)) { - case 0: ret = false; break; - case 1: ret = true; break; - case -1: - puts("unknown response; please try again"); - continue; - } - } - break; - } while (1); - - free(line); - - return ret; -} - static inline int is_power_of_2(unsigned long long n) { return (n != 0 && ((n & (n - 1)) == 0)); diff -Nru fio-2.16/oslib/linux-dev-lookup.c fio-3.1/oslib/linux-dev-lookup.c --- fio-2.16/oslib/linux-dev-lookup.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/oslib/linux-dev-lookup.c 2017-09-28 10:23:20.000000000 +0000 @@ -1,12 +1,12 @@ #include #include +#include #include #include #include #include -#include "../os/os.h" -#include "oslib/linux-dev-lookup.h" +#include "linux-dev-lookup.h" int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj, unsigned int min) @@ -21,7 +21,7 @@ return 0; while ((dir = readdir(D)) != NULL) { - char full_path[256]; + char full_path[257]; if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) continue; diff -Nru fio-2.16/oslib/strlcat.c fio-3.1/oslib/strlcat.c --- fio-2.16/oslib/strlcat.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/oslib/strlcat.c 2017-09-28 10:23:20.000000000 +0000 @@ -1,5 +1,5 @@ #include -#include "oslib/strlcat.h" +#include "strlcat.h" size_t strlcat(char *dst, const char *src, size_t size) { diff -Nru fio-2.16/oslib/strndup.c fio-3.1/oslib/strndup.c --- fio-2.16/oslib/strndup.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/oslib/strndup.c 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,18 @@ +#include +#include "strndup.h" + +#ifndef CONFIG_HAVE_STRNDUP + +char *strndup(const char *s, size_t n) +{ + char *str = malloc(n + 1); + + if (str) { + strncpy(str, s, n); + str[n] = '\0'; + } + + return str; +} + +#endif diff -Nru fio-2.16/oslib/strndup.h 
fio-3.1/oslib/strndup.h --- fio-2.16/oslib/strndup.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/oslib/strndup.h 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,7 @@ +#include + +#ifndef CONFIG_HAVE_STRNDUP + +char *strndup(const char *s, size_t n); + +#endif diff -Nru fio-2.16/parse.c fio-3.1/parse.c --- fio-2.16/parse.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/parse.c 2017-09-28 10:23:20.000000000 +0000 @@ -135,6 +135,7 @@ const char *p = str; char *c; unsigned long long mult = 1; + int i; /* * Go forward until we hit a non-digit, or +/- sign @@ -153,7 +154,7 @@ } c = strdup(p); - for (int i = 0; i < strlen(c); i++) + for (i = 0; i < strlen(c); i++) c[i] = tolower(c[i]); if (!strncmp("us", c, 2) || !strncmp("usec", c, 4)) @@ -167,7 +168,7 @@ else if (!strcmp("h", c)) mult = 60 * 60 * 1000000UL; else if (!strcmp("d", c)) - mult = 24 * 60 * 60 * 1000000UL; + mult = 24 * 60 * 60 * 1000000ULL; free(c); return mult; @@ -207,32 +208,50 @@ } } + /* If kb_base is 1000, use true units. + * If kb_base is 1024, use opposite units. 
+ */ if (!strncmp("pib", c, 3)) { pow = 5; - mult = 1000; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; } else if (!strncmp("tib", c, 3)) { pow = 4; - mult = 1000; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; } else if (!strncmp("gib", c, 3)) { pow = 3; - mult = 1000; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; } else if (!strncmp("mib", c, 3)) { pow = 2; - mult = 1000; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; } else if (!strncmp("kib", c, 3)) { pow = 1; - mult = 1000; - } else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2)) + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; + } else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2)) { pow = 5; - else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2)) + } else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2)) { pow = 4; - else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2)) + } else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2)) { pow = 3; - else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2)) + } else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2)) { pow = 2; - else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2)) + } else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2)) { pow = 1; - else if (!strncmp("%", c, 1)) { + } else if (!strncmp("%", c, 1)) { *percent = 1; free(c); return ret; @@ -1301,6 +1320,23 @@ } } +void options_mem_dupe(struct fio_option *options, void *data) +{ + struct fio_option *o; + char **ptr; + + dprint(FD_PARSE, "dup options\n"); + + for (o = &options[0]; o->name; o++) { + if (o->type != FIO_OPT_STR_STORE) + continue; + + ptr = td_var(data, o, o->off1); + if (*ptr) + *ptr = strdup(*ptr); + } +} + void options_free(struct fio_option *options, void *data) { struct fio_option *o; @@ -1309,7 +1345,7 @@ dprint(FD_PARSE, "free options\n"); for (o = &options[0]; o->name; o++) { - if (o->type != FIO_OPT_STR_STORE || 
!o->off1) + if (o->type != FIO_OPT_STR_STORE || !o->off1 || o->no_free) continue; ptr = td_var(data, o, o->off1); diff -Nru fio-2.16/parse.h fio-3.1/parse.h --- fio-2.16/parse.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/parse.h 2017-09-28 10:23:20.000000000 +0000 @@ -78,6 +78,7 @@ int is_time; /* time based value */ int no_warn_def; int pow2; /* must be a power-of-2 */ + int no_free; }; extern int parse_option(char *, const char *, struct fio_option *, struct fio_option **, void *, struct flist_head *); @@ -86,6 +87,7 @@ extern int show_cmd_help(struct fio_option *, const char *); extern void fill_default_options(void *, struct fio_option *); extern void options_init(struct fio_option *); +extern void options_mem_dupe(struct fio_option *, void *); extern void options_free(struct fio_option *, void *); extern void strip_blank_front(char **); @@ -106,8 +108,7 @@ typedef int (fio_opt_int_fn)(void *, int *); struct thread_options; -static inline void *td_var(struct thread_options *to, struct fio_option *o, - unsigned int offset) +static inline void *td_var(void *to, struct fio_option *o, unsigned int offset) { void *ret; diff -Nru fio-2.16/printing.c fio-3.1/printing.c --- fio-2.16/printing.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/printing.c 2017-09-28 10:23:20.000000000 +0000 @@ -31,7 +31,7 @@ gpointer data) { cairo_t *cr; - char str[20]; + char str[32]; double x, y; cr = gtk_print_context_get_cairo_context(context); diff -Nru fio-2.16/profiles/act.c fio-3.1/profiles/act.c --- fio-2.16/profiles/act.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/profiles/act.c 2017-09-28 10:23:20.000000000 +0000 @@ -47,20 +47,12 @@ static struct act_run_data *act_run_data; struct act_prof_data { - struct timeval sample_tv; + struct timespec sample_tv; struct act_slice *slices; unsigned int cur_slice; unsigned int nr_slices; }; -static char *device_names; -static unsigned int load; -static unsigned int prep; -static unsigned int threads_per_queue; -static unsigned int 
num_read_blocks; -static unsigned int write_size; -static unsigned long long test_duration; - #define ACT_MAX_OPTS 128 static const char *act_opts[ACT_MAX_OPTS] = { "direct=1", @@ -97,6 +89,7 @@ .help = "Devices to use", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_ACT, + .no_free = true, }, { .name = "load", @@ -130,21 +123,21 @@ }, { .name = "read-req-num-512-blocks", - .lname = "Number of 512b blocks to read", + .lname = "Number of 512B blocks to read", .type = FIO_OPT_INT, .off1 = offsetof(struct act_options, num_read_blocks), - .help = "Number of 512b blocks to read at the time", + .help = "Number of 512B blocks to read at the time", .def = "3", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_ACT, }, { .name = "large-block-op-kbytes", - .lname = "Size of large block ops (writes)", + .lname = "Size of large block ops in KiB (writes)", .type = FIO_OPT_INT, .off1 = offsetof(struct act_options, write_size), - .help = "Size of large block ops (writes)", - .def = "128k", + .help = "Size of large block ops in KiB (writes)", + .def = "131072", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_ACT, }, @@ -185,6 +178,8 @@ static int act_add_rw(const char *dev, int reads) { + struct act_options *ao = &act_options; + if (act_add_opt("name=act-%s-%s", reads ? "read" : "write", dev)) return 1; if (act_add_opt("filename=%s", dev)) @@ -192,21 +187,21 @@ if (act_add_opt("rw=%s", reads ? 
"randread" : "randwrite")) return 1; if (reads) { - int rload = load * R_LOAD / threads_per_queue; + int rload = ao->load * R_LOAD / ao->threads_per_queue; - if (act_add_opt("numjobs=%u", threads_per_queue)) + if (act_add_opt("numjobs=%u", ao->threads_per_queue)) return 1; if (act_add_opt("rate_iops=%u", rload)) return 1; - if (act_add_opt("bs=%u", num_read_blocks * 512)) + if (act_add_opt("bs=%u", ao->num_read_blocks * 512)) return 1; } else { - const int rsize = write_size / (num_read_blocks * 512); - int wload = (load * W_LOAD + rsize - 1) / rsize; + const int rsize = ao->write_size / (ao->num_read_blocks * 512); + int wload = (ao->load * W_LOAD + rsize - 1) / rsize; if (act_add_opt("rate_iops=%u", wload)) return 1; - if (act_add_opt("bs=%u", write_size)) + if (act_add_opt("bs=%u", ao->write_size)) return 1; } @@ -220,7 +215,7 @@ return 1; if (act_add_opt("filename=%s", dev)) return 1; - if (act_add_opt("bs=1M")) + if (act_add_opt("bs=1048576")) return 1; if (act_add_opt("zero_buffers")) return 1; @@ -234,7 +229,7 @@ return 1; if (act_add_opt("filename=%s", dev)) return 1; - if (act_add_opt("bs=4k")) + if (act_add_opt("bs=4096")) return 1; if (act_add_opt("ioengine=libaio")) return 1; @@ -248,10 +243,10 @@ static int act_add_dev(const char *dev) { - if (prep) + if (act_options.prep) return act_add_dev_prep(dev); - if (act_add_opt("runtime=%llus", test_duration)) + if (act_add_opt("runtime=%llus", act_options.test_duration)) return 1; if (act_add_opt("time_based=1")) return 1; @@ -269,7 +264,7 @@ */ static int act_prep_cmdline(void) { - if (!device_names) { + if (!act_options.device_names) { log_err("act: you need to set IO target(s) with the " "device-names option.\n"); return 1; @@ -280,7 +275,7 @@ do { char *dev; - dev = strsep(&device_names, ","); + dev = strsep(&act_options.device_names, ","); if (!dev) break; @@ -300,7 +295,7 @@ int i, ret = 0; double perm; - if (prep) + if (act_options.prep) return 0; /* @@ -431,7 +426,7 @@ get_act_ref(); apd = calloc(1, 
sizeof(*apd)); - nr_slices = (test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC; + nr_slices = (act_options.test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC; apd->slices = calloc(nr_slices, sizeof(struct act_slice)); apd->nr_slices = nr_slices; fio_gettime(&apd->sample_tv, NULL); diff -Nru fio-2.16/profiles/tiobench.c fio-3.1/profiles/tiobench.c --- fio-2.16/profiles/tiobench.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/profiles/tiobench.c 2017-09-28 10:23:20.000000000 +0000 @@ -39,7 +39,7 @@ .lname = "Tiobench size", .type = FIO_OPT_STR_VAL, .off1 = offsetof(struct tiobench_options, size), - .help = "Size in MB", + .help = "Size in MiB", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_TIOBENCH, }, @@ -49,7 +49,7 @@ .type = FIO_OPT_INT, .off1 = offsetof(struct tiobench_options, bs), .help = "Block size in bytes", - .def = "4k", + .def = "4096", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_TIOBENCH, }, @@ -70,6 +70,7 @@ .help = "Test directory", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_TIOBENCH, + .no_free = true, }, { .name = "threads", @@ -91,7 +92,7 @@ static int tb_prep_cmdline(void) { /* - * tiobench uses size as MB, so multiply up + * tiobench uses size as MiB, so multiply up */ size *= 1024 * 1024ULL; if (size) diff -Nru fio-2.16/rate-submit.c fio-3.1/rate-submit.c --- fio-2.16/rate-submit.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/rate-submit.c 2017-09-28 10:23:20.000000000 +0000 @@ -5,7 +5,7 @@ * */ #include "fio.h" -#include "ioengine.h" +#include "ioengines.h" #include "lib/getrusage.h" #include "rate-submit.h" @@ -98,7 +98,6 @@ { struct thread_data *parent = sw->wq->td; struct thread_data *td = sw->priv; - int fio_unused ret; memcpy(&td->o, &parent->o, sizeof(td->o)); memcpy(&td->ts, &parent->ts, sizeof(td->ts)); diff -Nru fio-2.16/README fio-3.1/README --- fio-2.16/README 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/README 2017-09-28 10:23:20.000000000 +0000 @@ -1,18 +1,31 @@ -fio ---- +Overview and history +-------------------- 
-fio is a tool that will spawn a number of threads or processes doing a -particular type of io action as specified by the user. fio takes a -number of global parameters, each inherited by the thread unless -otherwise parameters given to them overriding that setting is given. -The typical use of fio is to write a job file matching the io load -one wants to simulate. +Fio was originally written to save me the hassle of writing special test case +programs when I wanted to test a specific workload, either for performance +reasons or to find/reproduce a bug. The process of writing such a test app can +be tiresome, especially if you have to do it often. Hence I needed a tool that +would be able to simulate a given I/O workload without resorting to writing a +tailored test case again and again. + +A test work load is difficult to define, though. There can be any number of +processes or threads involved, and they can each be using their own way of +generating I/O. You could have someone dirtying large amounts of memory in an +memory mapped file, or maybe several threads issuing reads using asynchronous +I/O. fio needed to be flexible enough to simulate both of these cases, and many +more. + +Fio spawns a number of threads or processes doing a particular type of I/O +action as specified by the user. fio takes a number of global parameters, each +inherited by the thread unless otherwise parameters given to them overriding +that setting is given. The typical use of fio is to write a job file matching +the I/O load one wants to simulate. Source ------ -fio resides in a git repo, the canonical place is: +Fio resides in a git repo, the canonical place is: git://git.kernel.dk/fio.git @@ -21,63 +34,37 @@ http://git.kernel.dk/fio.git -Snapshots are frequently generated and include the git meta data as well. +Snapshots are frequently generated and :file:`fio-git-*.tar.gz` include the git +meta data as well. Other tarballs are archives of official fio releases. 
Snapshots can download from: http://brick.kernel.dk/snaps/ -There are also two official mirrors. Both of these are automatically synced -with the main repository, when changes are pushed. If the main repo is down -for some reason, either one of these is safe to use as a backup: +There are also two official mirrors. Both of these are automatically synced with +the main repository, when changes are pushed. If the main repo is down for some +reason, either one of these is safe to use as a backup: git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git + https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git or git://github.com/axboe/fio.git - https://github.com/axboe/fio.git - - -Binary packages ---------------- - -Debian: -Starting with Debian "Squeeze", fio packages are part of the official -Debian repository. http://packages.debian.org/search?keywords=fio - -Ubuntu: -Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part -of the Ubuntu "universe" repository. -http://packages.ubuntu.com/search?keywords=fio - -Red Hat, CentOS & Co: -Dag Wieërs has RPMs for Red Hat related distros, find them here: -http://dag.wieers.com/rpm/packages/fio/ -Mandriva: -Mandriva has integrated fio into their package repository, so installing -on that distro should be as easy as typing 'urpmi fio'. - -Solaris: -Packages for Solaris are available from OpenCSW. Install their pkgutil -tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via -'pkgutil -i fio'. - -Windows: -Bruce Cran has fio packages for Windows at -http://www.bluestop.org/fio/ . + https://github.com/axboe/fio.git Mailing list ------------ The fio project mailing list is meant for anything related to fio including -general discussion, bug reporting, questions, and development. +general discussion, bug reporting, questions, and development. For bug reporting, +see REPORTING-BUGS. -An automated mail detailing recent commits is automatically sent to the -list at most daily. 
The list address is fio@vger.kernel.org, subscribe -by sending an email to majordomo@vger.kernel.org with +An automated mail detailing recent commits is automatically sent to the list at +most daily. The list address is fio@vger.kernel.org, subscribe by sending an +email to majordomo@vger.kernel.org with subscribe fio @@ -90,260 +77,126 @@ http://maillist.kernel.dk/fio-devel/ -Building --------- - -Just type 'configure', 'make' and 'make install'. - -Note that GNU make is required. On BSD it's available from devel/gmake; -on Solaris it's in the SUNWgmake package. On platforms where GNU make -isn't the default, type 'gmake' instead of 'make'. - -Configure will print the enabled options. Note that on Linux based -platforms, the libaio development packages must be installed to use -the libaio engine. Depending on distro, it is usually called -libaio-devel or libaio-dev. - -For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required -to be installed. gfio isn't built automatically and can be enabled -with a --enable-gfio option to configure. - -To build FIO with a cross-compiler: - $ make clean - $ make CROSS_COMPILE=/path/to/toolchain/prefix -Configure will attempt to determine the target platform automatically. - -It's possible to build fio for ESX as well, use the --esx switch to -configure. +Author +------ +Fio was written by Jens Axboe to enable flexible testing of +the Linux I/O subsystem and schedulers. He got tired of writing specific test +applications to simulate a given workload, and found that the existing I/O +benchmark/test tools out there weren't flexible enough to do what he wanted. -Windows -------- +Jens Axboe 20060905 -On Windows, Cygwin (http://www.cygwin.com/) is required in order to -build fio. To create an MSI installer package install WiX 3.8 from -http://wixtoolset.org and run dobuild.cmd from the -os/windows directory. -How to compile fio on 64-bit Windows: +Binary packages +--------------- - 1. 
Install Cygwin (http://www.cygwin.com/). Install 'make' and all - packages starting with 'mingw64-i686' and 'mingw64-x86_64'. - 2. Open the Cygwin Terminal. - 3. Go to the fio directory (source files). - 4. Run 'make clean && make -j'. +Debian: + Starting with Debian "Squeeze", fio packages are part of the official + Debian repository. http://packages.debian.org/search?keywords=fio . -To build fio on 32-bit Windows, run './configure --build-32bit-win' before 'make'. +Ubuntu: + Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part + of the Ubuntu "universe" repository. + http://packages.ubuntu.com/search?keywords=fio . + +Red Hat, Fedora, CentOS & Co: + Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio + packages are part of the Fedora/EPEL repositories. + https://apps.fedoraproject.org/packages/fio . -It's recommended that once built or installed, fio be run in a Command Prompt -or other 'native' console such as console2, since there are known to be display -and signal issues when running it under a Cygwin shell -(see http://code.google.com/p/mintty/issues/detail?id=56 for details). +Mandriva: + Mandriva has integrated fio into their package repository, so installing + on that distro should be as easy as typing ``urpmi fio``. +Arch Linux: + An Arch Linux package is provided under the Community sub-repository: + https://www.archlinux.org/packages/?sort=&q=fio -Command line ------------- +Solaris: + Packages for Solaris are available from OpenCSW. Install their pkgutil + tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via + ``pkgutil -i fio``. 
-$ fio - --debug Enable some debugging options (see below) - --parse-only Parse options only, don't start any IO - --output Write output to file - --runtime Runtime in seconds - --bandwidth-log Generate aggregate bandwidth logs - --minimal Minimal (terse) output - --output-format=type Output format (terse,json,json+,normal) - --terse-version=type Terse version output format (default 3, or 2 or 4). - --version Print version info and exit - --help Print this page - --cpuclock-test Perform test/validation of CPU clock - --crctest[=test] Test speed of checksum functions - --cmdhelp=cmd Print command help, "all" for all of them - --enghelp=engine Print ioengine help, or list available ioengines - --enghelp=engine,cmd Print help for an ioengine cmd - --showcmd Turn a job file into command line options - --readonly Turn on safety read-only checks, preventing - writes - --eta=when When ETA estimate should be printed - May be "always", "never" or "auto" - --eta-newline=time Force a new line for every 'time' period passed - --status-interval=t Force full status dump every 't' period passed - --section=name Only run specified section in job file. - Multiple sections can be specified. - --alloc-size=kb Set smalloc pool to this size in kb (def 16384) - --warnings-fatal Fio parser warnings are fatal - --max-jobs Maximum number of threads/processes to support - --server=args Start backend server. See Client/Server section. - --client=host Connect to specified backend(s). - --remote-config=file Tell fio server to load this local file - --idle-prof=option Report cpu idleness on a system or percpu basis - (option=system,percpu) or run unit work - calibration only (option=calibrate). 
- --inflate-log=log Inflate and output compressed log - --trigger-file=file Execute trigger cmd when file exists - --trigger-timeout=t Execute trigger af this time - --trigger=cmd Set this command as local trigger - --trigger-remote=cmd Set this command as remote trigger - --aux-path=path Use this path for fio state generated files - - -Any parameters following the options will be assumed to be job files, -unless they match a job file parameter. Multiple job files can be listed -and each job file will be regarded as a separate group. fio will stonewall -execution between each group. - -The --readonly option is an extra safety guard to prevent users from -accidentally starting a write workload when that is not desired. Fio -will only write if rw=write/randwrite/rw/randrw is given. This extra -safety net can be used as an extra precaution as --readonly will also -enable a write check in the io engine core to prevent writes due to -unknown user space bug(s). - -The --debug option triggers additional logging by fio. -Currently, additional logging is available for: - - process Dump info related to processes - file Dump info related to file actions - io Dump info related to IO queuing - mem Dump info related to memory allocations - blktrace Dump info related to blktrace setup - verify Dump info related to IO verification - all Enable all debug options - random Dump info related to random offset generation - parse Dump info related to option matching and parsing - diskutil Dump info related to disk utilization updates - job:x Dump info only related to job number x - mutex Dump info only related to mutex up/down ops - profile Dump info related to profile extensions - time Dump info related to internal time keeping - net Dump info related to networking connections - rate Dump info related to IO rate switching - compress Dump info related to log compress/decompress - ? or help Show available debug options. - -One can specify multiple debug options: e.g. 
--debug=file,mem will enable -file and memory debugging. - -The --section option allows one to combine related jobs into one file. -E.g. one job file could define light, moderate, and heavy sections. Tell fio to -run only the "heavy" section by giving --section=heavy command line option. -One can also specify the "write" operations in one section and "verify" -operation in another section. The --section option only applies to job -sections. The reserved 'global' section is always parsed and used. - -The --alloc-size switch allows one to use a larger pool size for smalloc. -If running large jobs with randommap enabled, fio can run out of memory. -Smalloc is an internal allocator for shared structures from a fixed size -memory pool. The pool size defaults to 16M and can grow to 8 pools. +Windows: + Rebecca Cran has fio packages for Windows at + http://www.bluestop.org/fio/ . -NOTE: While running .fio_smalloc.* backing store files are visible in /tmp. +BSDs: + Packages for BSDs may be available from their binary package repositories. + Look for a package "fio" using their binary package managers. -Job file +Building -------- -See the HOWTO file for a complete description of job file syntax and -parameters. The --cmdhelp option also lists all options. If used with -an option argument, --cmdhelp will detail the given option. The job file -format is in the ini style format, as that is easy for the user to review -and modify. - -This README contains the terse version. Job files can describe big and -complex setups that are not possible with the command line. Job files -are a good practice even for simple jobs since the file provides an -easily accessed record of the workload and can include comments. - -See the examples/ directory for inspiration on how to write job files. Note -the copyright and license requirements currently apply to examples/ files. 
- - -Client/server ------------- - -Normally fio is invoked as a stand-alone application on the machine -where the IO workload should be generated. However, the frontend and -backend of fio can be run separately. Ie the fio server can generate -an IO workload on the "Device Under Test" while being controlled from -another machine. - -Start the server on the machine which has access to the storage DUT: - -fio --server=args - -where args defines what fio listens to. The arguments are of the form -'type,hostname or IP,port'. 'type' is either 'ip' (or ip4) for TCP/IP v4, -'ip6' for TCP/IP v6, or 'sock' for a local unix domain socket. -'hostname' is either a hostname or IP address, and 'port' is the port to -listen to (only valid for TCP/IP, not a local socket). Some examples: - -1) fio --server - - Start a fio server, listening on all interfaces on the default port (8765). - -2) fio --server=ip:hostname,4444 - - Start a fio server, listening on IP belonging to hostname and on port 4444. - -3) fio --server=ip6:::1,4444 - - Start a fio server, listening on IPv6 localhost ::1 and on port 4444. - -4) fio --server=,4444 +Just type:: - Start a fio server, listening on all interfaces on port 4444. + $ ./configure + $ make + $ make install + +Note that GNU make is required. On BSDs it's available from devel/gmake within +ports directory; on Solaris it's in the SUNWgmake package. On platforms where +GNU make isn't the default, type ``gmake`` instead of ``make``. + +Configure will print the enabled options. Note that on Linux based platforms, +the libaio development packages must be installed to use the libaio +engine. Depending on distro, it is usually called libaio-devel or libaio-dev. -5) fio --server=1.2.3.4 - - Start a fio server, listening on IP 1.2.3.4 on the default port. - -6) fio --server=sock:/tmp/fio.sock - - Start a fio server, listening on the local socket /tmp/fio.sock. 
- -Once a server is running, a "client" can connect to the fio server with: - -fio --local-args --client= --remote-args - -where --local-args are arguments for the client where it is -running, 'server' is the connect string, and --remote-args and -are sent to the server. The 'server' string follows the same format as it -does on the server side, to allow IP/hostname/socket and port strings. +For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required +to be installed. gfio isn't built automatically and can be enabled with a +``--enable-gfio`` option to configure. -Fio can connect to multiple servers this way: +To build fio with a cross-compiler:: -fio --client= --client= + $ make clean + $ make CROSS_COMPILE=/path/to/toolchain/prefix -If the job file is located on the fio server, then you can tell the server -to load a local file as well. This is done by using --remote-config: +Configure will attempt to determine the target platform automatically. -fio --client=server --remote-config /path/to/file.fio +It's possible to build fio for ESX as well, use the ``--esx`` switch to +configure. -Then fio will open this local (to the server) job file instead -of being passed one from the client. -If you have many servers (example: 100 VMs/containers), -you can input a pathname of a file containing host IPs/names as the parameter -value for the --client option. For example, here is an example "host.list" -file containing 2 hostnames: +Windows +~~~~~~~ -host1.your.dns.domain -host2.your.dns.domain +On Windows, Cygwin (http://www.cygwin.com/) is required in order to build +fio. To create an MSI installer package install WiX 3.8 from +http://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows` +directory. -The fio command would then be: +How to compile fio on 64-bit Windows: -fio --client=host.list + 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all + packages starting with **mingw64-i686** and **mingw64-x86_64**. + 2. 
Open the Cygwin Terminal. + 3. Go to the fio directory (source files). + 4. Run ``make clean && make -j``. -In this mode, you cannot input server-specific parameters or job files -- all -servers receive the same job file. +To build fio on 32-bit Windows, run ``./configure --build-32bit-win`` before +``make``. -In order to let fio --client runs use a shared filesystem -from multiple hosts, fio --client now prepends the IP address of the -server to the filename. For example, if fio is using directory /mnt/nfs/fio -and is writing filename fileio.tmp, with a --client hostfile containing -two hostnames h1 and h2 with IP addresses 192.168.10.120 and 192.168.10.121, -then fio will create two files: +It's recommended that once built or installed, fio be run in a Command Prompt or +other 'native' console such as console2, since there are known to be display and +signal issues when running it under a Cygwin shell (see +https://github.com/mintty/mintty/issues/56 and +https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs +for details). + + +Documentation +~~~~~~~~~~~~~ + +Fio uses Sphinx_ to generate documentation from the reStructuredText_ files. +To build HTML formatted documentation run ``make -C doc html`` and direct your +browser to :file:`./doc/output/html/index.html`. To build manual page run +``make -C doc man`` and then ``man doc/output/man/fio.1``. To see what other +output formats are supported run ``make -C doc help``. - /mnt/nfs/fio/192.168.10.120.fileio.tmp - /mnt/nfs/fio/192.168.10.121.fileio.tmp +.. _reStructuredText: http://www.sphinx-doc.org/rest.html +.. _Sphinx: http://www.sphinx-doc.org Platforms @@ -351,32 +204,31 @@ Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD, Windows, FreeBSD, and DragonFly. Some features and/or options may only be -available on some of the platforms, typically because those features only -apply to that platform (like the solarisaio engine, or the splice engine on -Linux). 
+available on some of the platforms, typically because those features only apply +to that platform (like the solarisaio engine, or the splice engine on Linux). Some features are not available on FreeBSD/Solaris even if they could be -implemented, I'd be happy to take patches for that. An example of that is -disk utility statistics and (I think) huge page support, support for that -does exist in FreeBSD/Solaris. - -Fio uses pthread mutexes for signalling and locking and FreeBSD does not -support process shared pthread mutexes. As a result, only threads are -supported on FreeBSD. This could be fixed with sysv ipc locking or -other locking alternatives. - -Other *BSD platforms are untested, but fio should work there almost out -of the box. Since I don't do test runs or even compiles on those platforms, -your mileage may vary. Sending me patches for other platforms is greatly +implemented, I'd be happy to take patches for that. An example of that is disk +utility statistics and (I think) huge page support, support for that does exist +in FreeBSD/Solaris. + +Fio uses pthread mutexes for signalling and locking and some platforms do not +support process shared pthread mutexes. As a result, on such platforms only +threads are supported. This could be fixed with sysv ipc locking or other +locking alternatives. + +Other \*BSD platforms are untested, but fio should work there almost out of the +box. Since I don't do test runs or even compiles on those platforms, your +mileage may vary. Sending me patches for other platforms is greatly appreciated. There's a lot of value in having the same test/benchmark tool available on all platforms. -Note that POSIX aio is not enabled by default on AIX. Messages like these: +Note that POSIX aio is not enabled by default on AIX. Messages like these:: Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because: Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix. -indicate one needs to enable POSIX aio. 
Run the following commands as root: +indicate one needs to enable POSIX aio. Run the following commands as root:: # lsdev -C -l posix_aio0 posix_aio0 Defined Posix Asynchronous I/O @@ -384,20 +236,41 @@ # lsdev -C -l posix_aio0 posix_aio0 Available Posix Asynchronous I/O -POSIX aio should work now. To make the change permanent: +POSIX aio should work now. To make the change permanent:: # chdev -l posix_aio0 -P -a autoconfig='available' posix_aio0 changed -Author ------- +Running fio +----------- -Fio was written by Jens Axboe to enable flexible testing -of the Linux IO subsystem and schedulers. He got tired of writing -specific test applications to simulate a given workload, and found that -the existing io benchmark/test tools out there weren't flexible enough -to do what he wanted. +Running fio is normally the easiest part - you just give it the job file +(or job files) as parameters:: -Jens Axboe 20060905 + $ fio [options] [jobfile] ... + +and it will start doing what the *jobfile* tells it to do. You can give more +than one job file on the command line, fio will serialize the running of those +files. Internally that is the same as using the :option:`stonewall` parameter +described in the parameter section. + +If the job file contains only one job, you may as well just give the parameters +on the command line. The command line parameters are identical to the job +parameters, with a few extra that control global parameters. For example, for +the job file parameter :option:`iodepth=2 `, the mirror command line +option would be :option:`--iodepth 2 ` or :option:`--iodepth=2 +`. You can also use the command line for giving more than one job +entry. For each :option:`--name ` option that fio sees, it will start a +new job with that name. Command line entries following a +:option:`--name ` entry will apply to that job, until there are no more +entries or a new :option:`--name ` entry is seen. 
This is similar to the +job file options, where each option applies to the current job until a new [] +job entry is seen. + +fio does not need to run as root, except if the files or devices specified in +the job section requires that. Some other options may also be restricted, such +as memory locking, I/O scheduler switching, and decreasing the nice value. +If *jobfile* is specified as ``-``, the job file will be read from standard +input. diff -Nru fio-2.16/server.c fio-3.1/server.c --- fio-2.16/server.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/server.c 2017-09-28 10:23:20.000000000 +0000 @@ -50,17 +50,6 @@ struct flist_head next; /* Other sk_entry's, if linked command */ }; -struct sk_out { - unsigned int refs; /* frees sk_out when it drops to zero. - * protected by below ->lock */ - - int sk; /* socket fd to talk to client */ - struct fio_mutex lock; /* protects ref and below list */ - struct flist_head list; /* list of pending transmit work */ - struct fio_mutex wait; /* wake backend when items added to list */ - struct fio_mutex xmit; /* held while sending data */ -}; - static char *fio_server_arg; static char *bind_sock; static struct sockaddr_in saddr_in; @@ -263,9 +252,10 @@ return fio_sendv_data(sk, &iov, 1); } -static int fio_recv_data(int sk, void *p, unsigned int len, bool wait) +static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait) { int flags; + char *p = buf; if (wait) flags = MSG_WAITALL; @@ -388,7 +378,7 @@ break; /* There's payload, get it */ - pdu = (void *) cmdret->payload + pdu_offset; + pdu = (char *) cmdret->payload + pdu_offset; ret = fio_recv_data(sk, pdu, cmd.pdu_len, wait); if (ret) break; @@ -449,7 +439,7 @@ reply = calloc(1, sizeof(*reply)); INIT_FLIST_HEAD(&reply->list); - fio_gettime(&reply->tv, NULL); + fio_gettime(&reply->ts, NULL); reply->saved_tag = tag; reply->opcode = opcode; @@ -866,7 +856,7 @@ #ifdef CONFIG_BIG_ENDIAN probe.bigendian = 1; #endif - strncpy((char *) probe.fio_version, 
fio_version_string, sizeof(probe.fio_version)); + strncpy((char *) probe.fio_version, fio_version_string, sizeof(probe.fio_version) - 1); probe.os = FIO_OS; probe.arch = FIO_ARCH; @@ -980,6 +970,7 @@ } else fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, rep, sz, NULL, SK_F_FREE | SK_F_INLINE); + fio_terminate_threads(TERMINATE_ALL); exec_trigger(buf); return 0; } @@ -1290,7 +1281,7 @@ ret = getsockname(sk, sockaddr_p, &len); if (ret) { - log_err("fio: getsockaddr: %s\n", strerror(errno)); + log_err("fio: getsockname: %s\n", strerror(errno)); return -1; } @@ -1444,7 +1435,7 @@ dst->min_run[i] = cpu_to_le64(src->min_run[i]); dst->max_bw[i] = cpu_to_le64(src->max_bw[i]); dst->min_bw[i] = cpu_to_le64(src->min_bw[i]); - dst->io_kb[i] = cpu_to_le64(src->io_kb[i]); + dst->iobytes[i] = cpu_to_le64(src->iobytes[i]); dst->agg[i] = cpu_to_le64(src->agg[i]); } @@ -1485,6 +1476,7 @@ convert_io_stat(&p.ts.slat_stat[i], &ts->slat_stat[i]); convert_io_stat(&p.ts.lat_stat[i], &ts->lat_stat[i]); convert_io_stat(&p.ts.bw_stat[i], &ts->bw_stat[i]); + convert_io_stat(&p.ts.iops_stat[i], &ts->iops_stat[i]); } p.ts.usr_time = cpu_to_le64(ts->usr_time); @@ -1492,7 +1484,8 @@ p.ts.ctx = cpu_to_le64(ts->ctx); p.ts.minf = cpu_to_le64(ts->minf); p.ts.majf = cpu_to_le64(ts->majf); - p.ts.clat_percentiles = cpu_to_le64(ts->clat_percentiles); + p.ts.clat_percentiles = cpu_to_le32(ts->clat_percentiles); + p.ts.lat_percentiles = cpu_to_le32(ts->lat_percentiles); p.ts.percentile_precision = cpu_to_le64(ts->percentile_precision); for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { @@ -1508,10 +1501,12 @@ p.ts.io_u_complete[i] = cpu_to_le32(ts->io_u_complete[i]); } - for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) { + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + p.ts.io_u_lat_n[i] = cpu_to_le32(ts->io_u_lat_n[i]); + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) p.ts.io_u_lat_u[i] = cpu_to_le32(ts->io_u_lat_u[i]); + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) p.ts.io_u_lat_m[i] = cpu_to_le32(ts->io_u_lat_m[i]); - } for (i = 0; i < 
DDIR_RWDIR_CNT; i++) for (j = 0; j < FIO_IO_U_PLAT_NR; j++) @@ -2279,7 +2274,7 @@ * For local domain sockets: * *ptr is the filename, *is_sock is 1. */ -int fio_server_parse_string(const char *str, char **ptr, int *is_sock, +int fio_server_parse_string(const char *str, char **ptr, bool *is_sock, int *port, struct in_addr *inp, struct in6_addr *inp6, int *ipv6) { @@ -2288,13 +2283,13 @@ int lport = 0; *ptr = NULL; - *is_sock = 0; + *is_sock = false; *port = fio_net_port; *ipv6 = 0; if (!strncmp(str, "sock:", 5)) { *ptr = strdup(str + 5); - *is_sock = 1; + *is_sock = true; return 0; } @@ -2373,7 +2368,8 @@ static int fio_handle_server_arg(void) { int port = fio_net_port; - int is_sock, ret = 0; + bool is_sock; + int ret = 0; saddr_in.sin_addr.s_addr = htonl(INADDR_ANY); @@ -2538,7 +2534,7 @@ pid = fork(); if (pid < 0) { - log_err("fio: failed server fork: %s", strerror(errno)); + log_err("fio: failed server fork: %s\n", strerror(errno)); free(pidfile); return -1; } else if (pid) { diff -Nru fio-2.16/server.h fio-3.1/server.h --- fio-2.16/server.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/server.h 2017-09-28 10:23:20.000000000 +0000 @@ -12,6 +12,17 @@ #define FIO_NET_PORT 8765 +struct sk_out { + unsigned int refs; /* frees sk_out when it drops to zero. 
+ * protected by below ->lock */ + + int sk; /* socket fd to talk to client */ + struct fio_mutex lock; /* protects ref and below list */ + struct flist_head list; /* list of pending transmit work */ + struct fio_mutex wait; /* wake backend when items added to list */ + struct fio_mutex xmit; /* held while sending data */ +}; + /* * On-wire encoding is little endian */ @@ -32,13 +43,13 @@ struct fio_net_cmd_reply { struct flist_head list; - struct timeval tv; + struct timespec ts; uint64_t saved_tag; uint16_t opcode; }; enum { - FIO_SERVER_VER = 60, + FIO_SERVER_VER = 66, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, FIO_SERVER_MAX_CMD_MB = 2048, @@ -201,7 +212,7 @@ extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *); extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *); extern void fio_server_set_arg(const char *); -extern int fio_server_parse_string(const char *, char **, int *, int *, struct in_addr *, struct in6_addr *, int *); +extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *); extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *); extern const char *fio_server_op(unsigned int); extern void fio_server_got_signal(int); diff -Nru fio-2.16/smalloc.c fio-3.1/smalloc.c --- fio-2.16/smalloc.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/smalloc.c 2017-09-28 10:23:20.000000000 +0000 @@ -13,6 +13,7 @@ #include #include +#include "fio.h" #include "mutex.h" #include "arch/arch.h" #include "os/os.h" @@ -188,7 +189,7 @@ goto out_fail; pool->map = ptr; - pool->bitmap = (void *) ptr + (pool->nr_blocks * SMALLOC_BPL); + pool->bitmap = (unsigned int *)((char *) ptr + (pool->nr_blocks * SMALLOC_BPL)); memset(pool->bitmap, 0, bitmap_blocks * sizeof(unsigned int)); pool->lock = fio_mutex_init(FIO_MUTEX_UNLOCKED); @@ -248,7 +249,7 @@ uintptr_t ptr; ptr = (uintptr_t) hdr + hdr->size - sizeof(unsigned int); - ptr = 
(ptr + int_mask) & ~int_mask; + ptr = (uintptr_t) PTR_ALIGN(ptr, int_mask); return (void *) ptr; } diff -Nru fio-2.16/stat.c fio-3.1/stat.c --- fio-2.16/stat.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/stat.c 2017-09-28 10:23:20.000000000 +0000 @@ -37,9 +37,9 @@ struct thread_stat *ts = &td->ts; fio_getrusage(&td->ru_end); - ts->usr_time += mtime_since(&td->ru_start.ru_utime, + ts->usr_time += mtime_since_tv(&td->ru_start.ru_utime, &td->ru_end.ru_utime); - ts->sys_time += mtime_since(&td->ru_start.ru_stime, + ts->sys_time += mtime_since_tv(&td->ru_start.ru_stime, &td->ru_end.ru_stime); ts->ctx += td->ru_end.ru_nvcsw + td->ru_end.ru_nivcsw - (td->ru_start.ru_nvcsw + td->ru_start.ru_nivcsw); @@ -58,7 +58,7 @@ * group by looking at the index bits. * */ -static unsigned int plat_val_to_idx(unsigned int val) +static unsigned int plat_val_to_idx(unsigned long long val) { unsigned int msb, error_bits, base, offset, idx; @@ -66,7 +66,7 @@ if (val == 0) msb = 0; else - msb = (sizeof(val)*8) - __builtin_clz(val) - 1; + msb = (sizeof(val)*8) - __builtin_clzll(val) - 1; /* * MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. 
Use @@ -98,9 +98,10 @@ * Convert the given index of the bucket array to the value * represented by the bucket */ -static unsigned int plat_idx_to_val(unsigned int idx) +static unsigned long long plat_idx_to_val(unsigned int idx) { - unsigned int error_bits, k, base; + unsigned int error_bits; + unsigned long long k, base; assert(idx < FIO_IO_U_PLAT_NR); @@ -111,7 +112,7 @@ /* Find the group and compute the minimum value of that group */ error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1; - base = 1 << (error_bits + FIO_IO_U_PLAT_BITS); + base = ((unsigned long long) 1) << (error_bits + FIO_IO_U_PLAT_BITS); /* Find its bucket number of the group */ k = idx % FIO_IO_U_PLAT_VAL; @@ -135,16 +136,16 @@ } unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, - fio_fp64_t *plist, unsigned int **output, - unsigned int *maxv, unsigned int *minv) + fio_fp64_t *plist, unsigned long long **output, + unsigned long long *maxv, unsigned long long *minv) { unsigned long sum = 0; unsigned int len, i, j = 0; unsigned int oval_len = 0; - unsigned int *ovals = NULL; - int is_last; + unsigned long long *ovals = NULL; + bool is_last; - *minv = -1U; + *minv = -1ULL; *maxv = 0; len = 0; @@ -165,7 +166,7 @@ /* * Calculate bucket values, note down max and min values */ - is_last = 0; + is_last = false; for (i = 0; i < FIO_IO_U_PLAT_NR && !is_last; i++) { sum += io_u_plat[i]; while (sum >= (plist[j].u.f / 100.0 * nr)) { @@ -173,7 +174,7 @@ if (j == oval_len) { oval_len += 100; - ovals = realloc(ovals, oval_len * sizeof(unsigned int)); + ovals = realloc(ovals, oval_len * sizeof(*ovals)); } ovals[j] = plat_idx_to_val(i); @@ -182,7 +183,7 @@ if (ovals[j] > *maxv) *maxv = ovals[j]; - is_last = (j == len - 1); + is_last = (j == len - 1) != 0; if (is_last) break; @@ -199,11 +200,14 @@ */ static void show_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned int precision, - struct buf_output *out) + bool is_clat, struct buf_output *out) { - 
unsigned int len, j = 0, minv, maxv; - unsigned int *ovals; - int is_last, per_line, scale_down; + unsigned int divisor, len, i, j = 0; + unsigned long long minv, maxv; + unsigned long long *ovals; + int per_line, scale_down, time_width; + const char *pre = is_clat ? "clat" : " lat"; + bool is_last; char fmt[32]; len = calc_clat_percentiles(io_u_plat, nr, plist, &ovals, &maxv, &minv); @@ -211,39 +215,42 @@ goto out; /* - * We default to usecs, but if the value range is such that we - * should scale down to msecs, do that. + * We default to nsecs, but if the value range is such that we + * should scale down to usecs or msecs, do that. */ - if (minv > 2000 && maxv > 99999) { + if (minv > 2000000 && maxv > 99999999ULL) { + scale_down = 2; + divisor = 1000000; + log_buf(out, " %s percentiles (msec):\n |", pre); + } else if (minv > 2000 && maxv > 99999) { scale_down = 1; - log_buf(out, " clat percentiles (msec):\n |"); + divisor = 1000; + log_buf(out, " %s percentiles (usec):\n |", pre); } else { scale_down = 0; - log_buf(out, " clat percentiles (usec):\n |"); + divisor = 1; + log_buf(out, " %s percentiles (nsec):\n |", pre); } - snprintf(fmt, sizeof(fmt), "%%1.%uf", precision); - per_line = (80 - 7) / (precision + 14); - for (j = 0; j < len; j++) { - char fbuf[16], *ptr = fbuf; + time_width = max(5, (int) (log10(maxv / divisor) + 1)); + snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3, + precision, time_width); + /* fmt will be something like " %5.2fth=[%4llu]%c" */ + per_line = (80 - 7) / (precision + 10 + time_width); + for (j = 0; j < len; j++) { /* for formatting */ if (j != 0 && (j % per_line) == 0) log_buf(out, " |"); /* end of the list */ - is_last = (j == len - 1); - - if (plist[j].u.f < 10.0) - ptr += sprintf(fbuf, " "); + is_last = (j == len - 1) != 0; - snprintf(ptr, sizeof(fbuf), fmt, plist[j].u.f); - - if (scale_down) + for (i = 0; i < scale_down; i++) ovals[j] = (ovals[j] + 999) / 1000; - log_buf(out, " %sth=[%5u]%c", fbuf, ovals[j], 
is_last ? '\n' : ','); + log_buf(out, fmt, plist[j].u.f, ovals[j], is_last ? '\n' : ','); if (is_last) break; @@ -257,8 +264,8 @@ free(ovals); } -bool calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max, - double *mean, double *dev) +bool calc_lat(struct io_stat *is, unsigned long long *min, + unsigned long long *max, double *mean, double *dev) { double n = (double) is->samples; @@ -279,7 +286,8 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out) { - char *p1, *p2, *p3, *p4; + char *io, *agg, *min, *max; + char *ioalt, *aggalt, *minalt, *maxalt; const char *str[] = { " READ", " WRITE" , " TRIM"}; int i; @@ -291,22 +299,28 @@ if (!rs->max_run[i]) continue; - p1 = num2str(rs->io_kb[i], 6, rs->kb_base, i2p, 8); - p2 = num2str(rs->agg[i], 6, rs->kb_base, i2p, rs->unit_base); - p3 = num2str(rs->min_bw[i], 6, rs->kb_base, i2p, rs->unit_base); - p4 = num2str(rs->max_bw[i], 6, rs->kb_base, i2p, rs->unit_base); - - log_buf(out, "%s: io=%s, aggrb=%s/s, minb=%s/s, maxb=%s/s," - " mint=%llumsec, maxt=%llumsec\n", + io = num2str(rs->iobytes[i], 4, 1, i2p, N2S_BYTE); + ioalt = num2str(rs->iobytes[i], 4, 1, !i2p, N2S_BYTE); + agg = num2str(rs->agg[i], 4, 1, i2p, rs->unit_base); + aggalt = num2str(rs->agg[i], 4, 1, !i2p, rs->unit_base); + min = num2str(rs->min_bw[i], 4, 1, i2p, rs->unit_base); + minalt = num2str(rs->min_bw[i], 4, 1, !i2p, rs->unit_base); + max = num2str(rs->max_bw[i], 4, 1, i2p, rs->unit_base); + maxalt = num2str(rs->max_bw[i], 4, 1, !i2p, rs->unit_base); + log_buf(out, "%s: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n", rs->unified_rw_rep ? 
" MIXED" : str[i], - p1, p2, p3, p4, + agg, aggalt, min, max, minalt, maxalt, io, ioalt, (unsigned long long) rs->min_run[i], (unsigned long long) rs->max_run[i]); - free(p1); - free(p2); - free(p3); - free(p4); + free(io); + free(agg); + free(min); + free(max); + free(ioalt); + free(aggalt); + free(minalt); + free(maxalt); } } @@ -348,6 +362,28 @@ } } +/* + * To keep the terse format unaltered, add all of the ns latency + * buckets to the first us latency bucket + */ +void stat_calc_lat_nu(struct thread_stat *ts, double *io_u_lat_u) +{ + unsigned long ntotal = 0, total = ddir_rw_sum(ts->total_io_u); + int i; + + stat_calc_lat(ts, io_u_lat_u, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR); + + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + ntotal += ts->io_u_lat_n[i]; + + io_u_lat_u[0] += 100.0 * (double) ntotal / (double) total; +} + +void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat) +{ + stat_calc_lat(ts, io_u_lat, ts->io_u_lat_n, FIO_IO_U_LAT_N_NR); +} + void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat) { stat_calc_lat(ts, io_u_lat, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR); @@ -358,17 +394,20 @@ stat_calc_lat(ts, io_u_lat, ts->io_u_lat_m, FIO_IO_U_LAT_M_NR); } -static void display_lat(const char *name, unsigned long min, unsigned long max, - double mean, double dev, struct buf_output *out) +static void display_lat(const char *name, unsigned long long min, + unsigned long long max, double mean, double dev, + struct buf_output *out) { - const char *base = "(usec)"; + const char *base = "(nsec)"; char *minp, *maxp; - if (usec_to_msec(&min, &max, &mean, &dev)) + if (nsec_to_msec(&min, &max, &mean, &dev)) base = "(msec)"; + else if (nsec_to_usec(&min, &max, &mean, &dev)) + base = "(usec)"; - minp = num2str(min, 6, 1, 0, 0); - maxp = num2str(max, 6, 1, 0, 0); + minp = num2str(min, 6, 1, 0, N2S_NONE); + maxp = num2str(max, 6, 1, 0, N2S_NONE); log_buf(out, " %s %s: min=%s, max=%s, avg=%5.02f," " stdev=%5.02f\n", name, base, minp, maxp, mean, dev); @@ -380,11 +419,11 
@@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, int ddir, struct buf_output *out) { - const char *str[] = { "read ", "write", "trim" }; - unsigned long min, max, runt; - unsigned long long bw, iops; + const char *str[] = { " read", "write", " trim" }; + unsigned long runt; + unsigned long long min, max, bw, iops; double mean, dev; - char *io_p, *bw_p, *iops_p; + char *io_p, *bw_p, *bw_p_alt, *iops_p; int i2p; assert(ddir_rw(ddir)); @@ -396,19 +435,21 @@ runt = ts->runtime[ddir]; bw = (1000 * ts->io_bytes[ddir]) / runt; - io_p = num2str(ts->io_bytes[ddir], 6, 1, i2p, 8); - bw_p = num2str(bw, 6, 1, i2p, ts->unit_base); + io_p = num2str(ts->io_bytes[ddir], 4, 1, i2p, N2S_BYTE); + bw_p = num2str(bw, 4, 1, i2p, ts->unit_base); + bw_p_alt = num2str(bw, 4, 1, !i2p, ts->unit_base); iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt; - iops_p = num2str(iops, 6, 1, 0, 0); + iops_p = num2str(iops, 4, 1, 0, N2S_NONE); - log_buf(out, " %s: io=%s, bw=%s/s, iops=%s, runt=%6llumsec\n", - rs->unified_rw_rep ? "mixed" : str[ddir], - io_p, bw_p, iops_p, - (unsigned long long) ts->runtime[ddir]); + log_buf(out, " %s: IOPS=%s, BW=%s (%s)(%s/%llumsec)\n", + rs->unified_rw_rep ? "mixed" : str[ddir], + iops_p, bw_p, bw_p_alt, io_p, + (unsigned long long) ts->runtime[ddir]); free(io_p); free(bw_p); + free(bw_p_alt); free(iops_p); if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) @@ -418,15 +459,31 @@ if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) display_lat(" lat", min, max, mean, dev, out); - if (ts->clat_percentiles) { + if (ts->clat_percentiles || ts->lat_percentiles) { show_clat_percentiles(ts->io_u_plat[ddir], ts->clat_stat[ddir].samples, ts->percentile_list, - ts->percentile_precision, out); + ts->percentile_precision, + ts->clat_percentiles, out); } if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { double p_of_agg = 100.0, fkb_base = (double)rs->kb_base; - const char *bw_str = (rs->unit_base == 1 ? 
"Kbit" : "KB"); + const char *bw_str; + + if ((rs->unit_base == 1) && i2p) + bw_str = "Kibit"; + else if (rs->unit_base == 1) + bw_str = "kbit"; + else if (i2p) + bw_str = "KiB"; + else + bw_str = "kB"; + + if (rs->agg[ddir]) { + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); + if (p_of_agg > 100.0) + p_of_agg = 100.0; + } if (rs->unit_base == 1) { min *= 8.0; @@ -435,40 +492,41 @@ dev *= 8.0; } - if (rs->agg[ddir]) { - p_of_agg = mean * 100 / (double) rs->agg[ddir]; - if (p_of_agg > 100.0) - p_of_agg = 100.0; - } - if (mean > fkb_base * fkb_base) { min /= fkb_base; max /= fkb_base; mean /= fkb_base; dev /= fkb_base; - bw_str = (rs->unit_base == 1 ? "Mbit" : "MB"); + bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB"); } - log_buf(out, " bw (%-4s/s): min=%5lu, max=%5lu, per=%3.2f%%," - " avg=%5.02f, stdev=%5.02f\n", bw_str, min, max, - p_of_agg, mean, dev); + log_buf(out, " bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, " + "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n", + bw_str, min, max, p_of_agg, mean, dev, + (&ts->bw_stat[ddir])->samples); + } + if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) { + log_buf(out, " iops : min=%5llu, max=%5llu, " + "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n", + min, max, mean, dev, (&ts->iops_stat[ddir])->samples); } } -static int show_lat(double *io_u_lat, int nr, const char **ranges, - const char *msg, struct buf_output *out) +static bool show_lat(double *io_u_lat, int nr, const char **ranges, + const char *msg, struct buf_output *out) { - int new_line = 1, i, line = 0, shown = 0; + bool new_line = true, shown = false; + int i, line = 0; for (i = 0; i < nr; i++) { if (io_u_lat[i] <= 0.0) continue; - shown = 1; + shown = true; if (new_line) { if (line) log_buf(out, "\n"); - log_buf(out, " lat (%s) : ", msg); - new_line = 0; + log_buf(out, " lat (%s) : ", msg); + new_line = false; line = 0; } if (line) @@ -476,13 +534,21 @@ log_buf(out, "%s%3.2f%%", ranges[i], io_u_lat[i]); line++; if (line == 5) - 
new_line = 1; + new_line = true; } if (shown) log_buf(out, "\n"); - return shown; + return true; +} + +static void show_lat_n(double *io_u_lat_n, struct buf_output *out) +{ + const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", + "250=", "500=", "750=", "1000=", }; + + show_lat(io_u_lat_n, FIO_IO_U_LAT_N_NR, ranges, "nsec", out); } static void show_lat_u(double *io_u_lat_u, struct buf_output *out) @@ -504,12 +570,15 @@ static void show_latencies(struct thread_stat *ts, struct buf_output *out) { + double io_u_lat_n[FIO_IO_U_LAT_N_NR]; double io_u_lat_u[FIO_IO_U_LAT_U_NR]; double io_u_lat_m[FIO_IO_U_LAT_M_NR]; + stat_calc_lat_n(ts, io_u_lat_n); stat_calc_lat_u(ts, io_u_lat_u); stat_calc_lat_m(ts, io_u_lat_m); + show_lat_n(io_u_lat_n, out); show_lat_u(io_u_lat_u, out); show_lat_m(io_u_lat_m, out); } @@ -659,7 +728,7 @@ static void show_ss_normal(struct thread_stat *ts, struct buf_output *out) { - char *p1, *p2; + char *p1, *p1alt, *p2; unsigned long long bw_mean, iops_mean; const int i2p = is_power_of_2(ts->kb_base); @@ -669,18 +738,20 @@ bw_mean = steadystate_bw_mean(ts); iops_mean = steadystate_iops_mean(ts); - p1 = num2str(bw_mean / ts->kb_base, 6, ts->kb_base, i2p, ts->unit_base); - p2 = num2str(iops_mean, 6, 1, 0, 0); + p1 = num2str(bw_mean / ts->kb_base, 4, ts->kb_base, i2p, ts->unit_base); + p1alt = num2str(bw_mean / ts->kb_base, 4, ts->kb_base, !i2p, ts->unit_base); + p2 = num2str(iops_mean, 4, 1, 0, N2S_NONE); - log_buf(out, " steadystate : attained=%s, bw=%s/s, iops=%s, %s%s=%.3f%s\n", + log_buf(out, " steadystate : attained=%s, bw=%s (%s), iops=%s, %s%s=%.3f%s\n", ts->ss_state & __FIO_SS_ATTAINED ? "yes" : "no", - p1, p2, + p1, p1alt, p2, ts->ss_state & __FIO_SS_IOPS ? "iops" : "bw", ts->ss_state & __FIO_SS_SLOPE ? " slope": " mean dev", ts->ss_criterion.u.f, ts->ss_state & __FIO_SS_PCT ? 
"%" : ""); free(p1); + free(p1alt); free(p2); } @@ -761,9 +832,9 @@ io_u_dist[1], io_u_dist[2], io_u_dist[3], io_u_dist[4], io_u_dist[5], io_u_dist[6]); - log_buf(out, " issued : total=r=%llu/w=%llu/d=%llu," - " short=r=%llu/w=%llu/d=%llu," - " drop=r=%llu/w=%llu/d=%llu\n", + log_buf(out, " issued rwt: total=%llu,%llu,%llu," + " short=%llu,%llu,%llu," + " dropped=%llu,%llu,%llu\n", (unsigned long long) ts->total_io_u[0], (unsigned long long) ts->total_io_u[1], (unsigned long long) ts->total_io_u[2], @@ -797,14 +868,13 @@ static void show_ddir_status_terse(struct thread_stat *ts, struct group_run_stats *rs, int ddir, - struct buf_output *out) + int ver, struct buf_output *out) { - unsigned long min, max; - unsigned long long bw, iops; - unsigned int *ovals = NULL; + unsigned long long min, max, minv, maxv, bw, iops; + unsigned long long *ovals = NULL; double mean, dev; - unsigned int len, minv, maxv; - int i; + unsigned int len; + int i, bw_stat; assert(ddir_rw(ddir)); @@ -812,7 +882,7 @@ if (ts->runtime[ddir]) { uint64_t runt = ts->runtime[ddir]; - bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; + bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; /* KiB/s */ iops = (1000 * (uint64_t) ts->total_io_u[ddir]) / runt; } @@ -821,16 +891,16 @@ (unsigned long long) ts->runtime[ddir]); if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) - log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev); + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); else - log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0); + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev)) - log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev); + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); else - log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0); + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); - if (ts->clat_percentiles) { + if (ts->clat_percentiles || 
ts->lat_percentiles) { len = calc_clat_percentiles(ts->io_u_plat[ddir], ts->clat_stat[ddir].samples, ts->percentile_list, &ovals, &maxv, @@ -843,39 +913,53 @@ log_buf(out, ";0%%=0"); continue; } - log_buf(out, ";%f%%=%u", ts->percentile_list[i].u.f, ovals[i]); + log_buf(out, ";%f%%=%llu", ts->percentile_list[i].u.f, ovals[i]/1000); } if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) - log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev); + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); else - log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0); + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); if (ovals) free(ovals); - if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { + bw_stat = calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev); + if (bw_stat) { double p_of_agg = 100.0; if (rs->agg[ddir]) { - p_of_agg = mean * 100 / (double) rs->agg[ddir]; + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); if (p_of_agg > 100.0) p_of_agg = 100.0; } - log_buf(out, ";%lu;%lu;%f%%;%f;%f", min, max, p_of_agg, mean, dev); + log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev); } else - log_buf(out, ";%lu;%lu;%f%%;%f;%f", 0UL, 0UL, 0.0, 0.0, 0.0); + log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0); + + if (ver == 5) { + if (bw_stat) + log_buf(out, ";%" PRIu64, (&ts->bw_stat[ddir])->samples); + else + log_buf(out, ";%lu", 0UL); + + if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) + log_buf(out, ";%llu;%llu;%f;%f;%" PRIu64, min, max, + mean, dev, (&ts->iops_stat[ddir])->samples); + else + log_buf(out, ";%llu;%llu;%f;%f;%lu", 0ULL, 0ULL, 0.0, 0.0, 0UL); + } } static void add_ddir_status_json(struct thread_stat *ts, struct group_run_stats *rs, int ddir, struct json_object *parent) { - unsigned long min, max; + unsigned long long min, max, minv, maxv; unsigned long long bw; - unsigned int *ovals = NULL; + unsigned long long *ovals = NULL; double mean, dev, iops; - unsigned int len, 
minv, maxv; + unsigned int len; int i; const char *ddirname[] = {"read", "write", "trim"}; struct json_object *dir_object, *tmp_object, *percentile_object, *clat_bins_object; @@ -896,11 +980,12 @@ if (ts->runtime[ddir]) { uint64_t runt = ts->runtime[ddir]; - bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; + bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; /* KiB/s */ iops = (1000.0 * (uint64_t) ts->total_io_u[ddir]) / runt; } - json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir] >> 10); + json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir]); + json_object_add_value_int(dir_object, "io_kbytes", ts->io_bytes[ddir] >> 10); json_object_add_value_int(dir_object, "bw", bw); json_object_add_value_float(dir_object, "iops", iops); json_object_add_value_int(dir_object, "runtime", ts->runtime[ddir]); @@ -913,7 +998,7 @@ mean = dev = 0.0; } tmp_object = json_create_object(); - json_object_add_value_object(dir_object, "slat", tmp_object); + json_object_add_value_object(dir_object, "slat_ns", tmp_object); json_object_add_value_int(tmp_object, "min", min); json_object_add_value_int(tmp_object, "max", max); json_object_add_value_float(tmp_object, "mean", mean); @@ -924,13 +1009,13 @@ mean = dev = 0.0; } tmp_object = json_create_object(); - json_object_add_value_object(dir_object, "clat", tmp_object); + json_object_add_value_object(dir_object, "clat_ns", tmp_object); json_object_add_value_int(tmp_object, "min", min); json_object_add_value_int(tmp_object, "max", max); json_object_add_value_float(tmp_object, "mean", mean); json_object_add_value_float(tmp_object, "stddev", dev); - if (ts->clat_percentiles) { + if (ts->clat_percentiles || ts->lat_percentiles) { len = calc_clat_percentiles(ts->io_u_plat[ddir], ts->clat_stat[ddir].samples, ts->percentile_list, &ovals, &maxv, @@ -953,12 +1038,11 @@ clat_bins_object = json_create_object(); json_object_add_value_object(tmp_object, "bins", clat_bins_object); for(i = 0; i < FIO_IO_U_PLAT_NR; i++) { - 
snprintf(buf, sizeof(buf), "%d", i); - json_object_add_value_int(clat_bins_object, (const char *)buf, ts->io_u_plat[ddir][i]); + if (ts->io_u_plat[ddir][i]) { + snprintf(buf, sizeof(buf), "%llu", plat_idx_to_val(i)); + json_object_add_value_int(clat_bins_object, (const char *)buf, ts->io_u_plat[ddir][i]); + } } - json_object_add_value_int(clat_bins_object, "FIO_IO_U_PLAT_BITS", FIO_IO_U_PLAT_BITS); - json_object_add_value_int(clat_bins_object, "FIO_IO_U_PLAT_VAL", FIO_IO_U_PLAT_VAL); - json_object_add_value_int(clat_bins_object, "FIO_IO_U_PLAT_NR", FIO_IO_U_PLAT_NR); } if (!calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) { @@ -966,7 +1050,7 @@ mean = dev = 0.0; } tmp_object = json_create_object(); - json_object_add_value_object(dir_object, "lat", tmp_object); + json_object_add_value_object(dir_object, "lat_ns", tmp_object); json_object_add_value_int(tmp_object, "min", min); json_object_add_value_int(tmp_object, "max", max); json_object_add_value_float(tmp_object, "mean", mean); @@ -976,7 +1060,7 @@ if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { if (rs->agg[ddir]) { - p_of_agg = mean * 100 / (double) rs->agg[ddir]; + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); if (p_of_agg > 100.0) p_of_agg = 100.0; } @@ -989,74 +1073,24 @@ json_object_add_value_float(dir_object, "bw_agg", p_of_agg); json_object_add_value_float(dir_object, "bw_mean", mean); json_object_add_value_float(dir_object, "bw_dev", dev); -} - -static void show_thread_status_terse_v2(struct thread_stat *ts, - struct group_run_stats *rs, - struct buf_output *out) -{ - double io_u_dist[FIO_IO_U_MAP_NR]; - double io_u_lat_u[FIO_IO_U_LAT_U_NR]; - double io_u_lat_m[FIO_IO_U_LAT_M_NR]; - double usr_cpu, sys_cpu; - int i; - - /* General Info */ - log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error); - /* Log Read Status */ - show_ddir_status_terse(ts, rs, DDIR_READ, out); - /* Log Write Status */ - show_ddir_status_terse(ts, rs, DDIR_WRITE, out); - /* Log Trim Status */ - 
show_ddir_status_terse(ts, rs, DDIR_TRIM, out); + json_object_add_value_int(dir_object, "bw_samples", + (&ts->bw_stat[ddir])->samples); - /* CPU Usage */ - if (ts->total_run_time) { - double runt = (double) ts->total_run_time; - - usr_cpu = (double) ts->usr_time * 100 / runt; - sys_cpu = (double) ts->sys_time * 100 / runt; - } else { - usr_cpu = 0; - sys_cpu = 0; + if (!calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) { + min = max = 0; + mean = dev = 0.0; } - - log_buf(out, ";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu, - (unsigned long long) ts->ctx, - (unsigned long long) ts->majf, - (unsigned long long) ts->minf); - - /* Calc % distribution of IO depths, usecond, msecond latency */ - stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); - stat_calc_lat_u(ts, io_u_lat_u); - stat_calc_lat_m(ts, io_u_lat_m); - - /* Only show fixed 7 I/O depth levels*/ - log_buf(out, ";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%", - io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3], - io_u_dist[4], io_u_dist[5], io_u_dist[6]); - - /* Microsecond latency */ - for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) - log_buf(out, ";%3.2f%%", io_u_lat_u[i]); - /* Millisecond latency */ - for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) - log_buf(out, ";%3.2f%%", io_u_lat_m[i]); - /* Additional output if continue_on_error set - default off*/ - if (ts->continue_on_error) - log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error); - log_buf(out, "\n"); - - /* Additional output if description is set */ - if (strlen(ts->description)) - log_buf(out, ";%s", ts->description); - - log_buf(out, "\n"); + json_object_add_value_int(dir_object, "iops_min", min); + json_object_add_value_int(dir_object, "iops_max", max); + json_object_add_value_float(dir_object, "iops_mean", mean); + json_object_add_value_float(dir_object, "iops_stddev", dev); + json_object_add_value_int(dir_object, "iops_samples", + (&ts->iops_stat[ddir])->samples); } -static void 
show_thread_status_terse_v3_v4(struct thread_stat *ts, - struct group_run_stats *rs, int ver, - struct buf_output *out) +static void show_thread_status_terse_all(struct thread_stat *ts, + struct group_run_stats *rs, int ver, + struct buf_output *out) { double io_u_dist[FIO_IO_U_MAP_NR]; double io_u_lat_u[FIO_IO_U_LAT_U_NR]; @@ -1065,15 +1099,19 @@ int i; /* General Info */ - log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string, - ts->name, ts->groupid, ts->error); + if (ver == 2) + log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error); + else + log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string, + ts->name, ts->groupid, ts->error); + /* Log Read Status */ - show_ddir_status_terse(ts, rs, DDIR_READ, out); + show_ddir_status_terse(ts, rs, DDIR_READ, ver, out); /* Log Write Status */ - show_ddir_status_terse(ts, rs, DDIR_WRITE, out); + show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out); /* Log Trim Status */ - if (ver == 4) - show_ddir_status_terse(ts, rs, DDIR_TRIM, out); + if (ver == 2 || ver == 4 || ver == 5) + show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out); /* CPU Usage */ if (ts->total_run_time) { @@ -1093,7 +1131,7 @@ /* Calc % distribution of IO depths, usecond, msecond latency */ stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); - stat_calc_lat_u(ts, io_u_lat_u); + stat_calc_lat_nu(ts, io_u_lat_u); stat_calc_lat_m(ts, io_u_lat_m); /* Only show fixed 7 I/O depth levels*/ @@ -1109,11 +1147,14 @@ log_buf(out, ";%3.2f%%", io_u_lat_m[i]); /* disk util stats, if any */ - show_disk_util(1, NULL, out); + if (ver >= 3) + show_disk_util(1, NULL, out); /* Additional output if continue_on_error set - default off*/ if (ts->continue_on_error) log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error); + if (ver == 2) + log_buf(out, "\n"); /* Additional output if description is set */ if (strlen(ts->description)) @@ -1154,6 +1195,7 @@ struct json_object *root, *tmp; struct jobs_eta *je; double 
io_u_dist[FIO_IO_U_MAP_NR]; + double io_u_lat_n[FIO_IO_U_LAT_N_NR]; double io_u_lat_u[FIO_IO_U_LAT_U_NR]; double io_u_lat_m[FIO_IO_U_LAT_M_NR]; double usr_cpu, sys_cpu; @@ -1198,6 +1240,7 @@ /* Calc % distribution of IO depths, usecond, msecond latency */ stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); + stat_calc_lat_n(ts, io_u_lat_n); stat_calc_lat_u(ts, io_u_lat_u); stat_calc_lat_m(ts, io_u_lat_m); @@ -1213,9 +1256,17 @@ json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]); } + /* Nanosecond latency */ tmp = json_create_object(); - json_object_add_value_object(root, "latency_us", tmp); + json_object_add_value_object(root, "latency_ns", tmp); + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) { + const char *ranges[] = { "2", "4", "10", "20", "50", "100", + "250", "500", "750", "1000", }; + json_object_add_value_float(tmp, ranges[i], io_u_lat_n[i]); + } /* Microsecond latency */ + tmp = json_create_object(); + json_object_add_value_object(root, "latency_us", tmp); for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) { const char *ranges[] = { "2", "4", "10", "20", "50", "100", "250", "500", "750", "1000", }; @@ -1344,10 +1395,8 @@ struct group_run_stats *rs, struct buf_output *out) { - if (terse_version == 2) - show_thread_status_terse_v2(ts, rs, out); - else if (terse_version == 3 || terse_version == 4) - show_thread_status_terse_v3_v4(ts, rs, terse_version, out); + if (terse_version >= 2 && terse_version <= 5) + show_thread_status_terse_all(ts, rs, terse_version, out); else log_err("fio: bad terse version!? 
%d\n", terse_version); } @@ -1418,7 +1467,7 @@ if (dst->min_bw[i] && dst->min_bw[i] > src->min_bw[i]) dst->min_bw[i] = src->min_bw[i]; - dst->io_kb[i] += src->io_kb[i]; + dst->iobytes[i] += src->iobytes[i]; dst->agg[i] += src->agg[i]; } @@ -1439,6 +1488,7 @@ sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first); sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first); sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first); + sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first); dst->io_bytes[l] += src->io_bytes[l]; @@ -1449,6 +1499,7 @@ sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first); sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first); sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first); + sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first); dst->io_bytes[0] += src->io_bytes[l]; @@ -1475,6 +1526,8 @@ dst->io_u_submit[k] += src->io_u_submit[k]; for (k = 0; k < FIO_IO_U_MAP_NR; k++) dst->io_u_complete[k] += src->io_u_complete[k]; + for (k = 0; k < FIO_IO_U_LAT_N_NR; k++) + dst->io_u_lat_n[k] += src->io_u_lat_n[k]; for (k = 0; k < FIO_IO_U_LAT_U_NR; k++) dst->io_u_lat_u[k] += src->io_u_lat_u[k]; for (k = 0; k < FIO_IO_U_LAT_M_NR; k++) @@ -1528,6 +1581,7 @@ ts->clat_stat[j].min_val = -1UL; ts->slat_stat[j].min_val = -1UL; ts->bw_stat[j].min_val = -1UL; + ts->iops_stat[j].min_val = -1UL; } ts->groupid = -1; } @@ -1538,8 +1592,8 @@ struct thread_data *td; struct thread_stat *threadstats, *ts; int i, j, k, nr_ts, last_ts, idx; - int kb_base_warned = 0; - int unit_base_warned = 0; + bool kb_base_warned = false; + bool unit_base_warned = false; struct json_object *root = NULL; struct json_array *array = NULL; struct buf_output output[FIO_OUTPUT_NR]; @@ -1563,6 +1617,8 @@ } if (last_ts == td->groupid) continue; + if (!td->o.stats) + continue; last_ts = td->groupid; nr_ts++; @@ -1580,6 +1636,8 @@ last_ts = -1; idx = 0; for_each_td(td, i) { + if (!td->o.stats) + continue; if (idx && (!td->o.group_reporting || (td->o.group_reporting && last_ts != td->groupid))) { 
idx = 0; @@ -1591,6 +1649,7 @@ ts = &threadstats[j]; ts->clat_percentiles = td->o.clat_percentiles; + ts->lat_percentiles = td->o.lat_percentiles; ts->percentile_precision = td->o.percentile_precision; memcpy(ts->percentile_list, td->o.percentile_list, sizeof(td->o.percentile_list)); opt_lists[j] = &td->opt_list; @@ -1627,11 +1686,11 @@ } else if (ts->kb_base != td->o.kb_base && !kb_base_warned) { log_info("fio: kb_base differs for jobs in group, using" " %u as the base\n", ts->kb_base); - kb_base_warned = 1; + kb_base_warned = true; } else if (ts->unit_base != td->o.unit_base && !unit_base_warned) { log_info("fio: unit_base differs for jobs in group, using" " %u as the base\n", ts->unit_base); - unit_base_warned = 1; + unit_base_warned = true; } ts->continue_on_error = td->o.continue_on_error; @@ -1696,19 +1755,14 @@ rs->max_run[j] = ts->runtime[j]; bw = 0; - if (ts->runtime[j]) { - unsigned long runt = ts->runtime[j]; - unsigned long long kb; - - kb = ts->io_bytes[j] / rs->kb_base; - bw = kb * 1000 / runt; - } + if (ts->runtime[j]) + bw = ts->io_bytes[j] * 1000 / ts->runtime[j]; if (bw < rs->min_bw[j]) rs->min_bw[j] = bw; if (bw > rs->max_bw[j]) rs->max_bw[j] = bw; - rs->io_kb[j] += ts->io_bytes[j] / rs->kb_base; + rs->iobytes[j] += ts->io_bytes[j]; } } @@ -1719,7 +1773,7 @@ for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { if (rs->max_run[ddir]) - rs->agg[ddir] = (rs->io_kb[ddir] * 1000) / + rs->agg[ddir] = (rs->iobytes[ddir] * 1000) / rs->max_run[ddir]; } } @@ -1808,8 +1862,10 @@ } for (i = 0; i < FIO_OUTPUT_NR; i++) { - buf_output_flush(&output[i]); - buf_output_free(&output[i]); + struct buf_output *out = &output[i]; + + log_info_buf(out->buf, out->buflen); + buf_output_free(out); } log_info_flush(); @@ -1829,22 +1885,22 @@ { struct thread_data *td; unsigned long long *rt; - struct timeval tv; + struct timespec ts; int i; fio_mutex_down(stat_mutex); rt = malloc(thread_number * sizeof(unsigned long long)); - fio_gettime(&tv, NULL); + fio_gettime(&ts, NULL); 
for_each_td(td, i) { td->update_rusage = 1; td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ]; td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE]; td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM]; - td->ts.total_run_time = mtime_since(&td->epoch, &tv); + td->ts.total_run_time = mtime_since(&td->epoch, &ts); - rt[i] = mtime_since(&td->start, &tv); + rt[i] = mtime_since(&td->start, &ts); if (td_read(td) && td->ts.io_bytes[DDIR_READ]) td->ts.runtime[DDIR_READ] += rt[i]; if (td_write(td) && td->ts.io_bytes[DDIR_WRITE]) @@ -1878,9 +1934,9 @@ fio_mutex_up(stat_mutex); } -static int status_interval_init; -static struct timeval status_time; -static int status_file_disabled; +static bool status_interval_init; +static struct timespec status_time; +static bool status_file_disabled; #define FIO_STATUS_FILE "fio-dump-status" @@ -1911,7 +1967,7 @@ log_err("fio: failed to unlink %s: %s\n", fio_status_file_path, strerror(errno)); log_err("fio: disabling status file updates\n"); - status_file_disabled = 1; + status_file_disabled = true; } return 1; @@ -1922,7 +1978,7 @@ if (status_interval) { if (!status_interval_init) { fio_gettime(&status_time, NULL); - status_interval_init = 1; + status_interval_init = true; } else if (mtime_since_now(&status_time) >= status_interval) { show_running_run_stats(); fio_gettime(&status_time, NULL); @@ -1935,7 +1991,7 @@ } } -static inline void add_stat_sample(struct io_stat *is, unsigned long data) +static inline void add_stat_sample(struct io_stat *is, unsigned long long data) { double val = data; double delta; @@ -2108,7 +2164,7 @@ if (iolog->disabled) return; if (flist_empty(&iolog->io_logs)) - iolog->avg_last = t; + iolog->avg_last[ddir] = t; cur_log = get_cur_log(iolog); if (cur_log) { @@ -2154,6 +2210,9 @@ ts->io_bytes[i] = 0; ts->runtime[i] = 0; + ts->total_io_u[i] = 0; + ts->short_io_u[i] = 0; + ts->drop_io_u[i] = 0; for (j = 0; j < FIO_IO_U_PLAT_NR; j++) ts->io_u_plat[i][j] = 0; @@ -2163,17 +2222,17 @@ ts->io_u_map[i] = 0; 
ts->io_u_submit[i] = 0; ts->io_u_complete[i] = 0; + } + + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + ts->io_u_lat_n[i] = 0; + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) ts->io_u_lat_u[i] = 0; + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) ts->io_u_lat_m[i] = 0; - ts->total_submit = 0; - ts->total_complete = 0; - } - for (i = 0; i < 3; i++) { - ts->total_io_u[i] = 0; - ts->short_io_u[i] = 0; - ts->drop_io_u[i] = 0; - } + ts->total_submit = 0; + ts->total_complete = 0; } static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir, @@ -2236,9 +2295,9 @@ * If period hasn't passed, adding the above sample is all we * need to do. */ - this_window = elapsed - iolog->avg_last; - if (elapsed < iolog->avg_last) - return iolog->avg_last - elapsed; + this_window = elapsed - iolog->avg_last[ddir]; + if (elapsed < iolog->avg_last[ddir]) + return iolog->avg_last[ddir] - elapsed; else if (this_window < iolog->avg_msec) { int diff = iolog->avg_msec - this_window; @@ -2246,9 +2305,9 @@ return diff; } - _add_stat_to_log(iolog, elapsed, td->o.log_max != 0); + __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0); - iolog->avg_last = elapsed - (this_window - iolog->avg_msec); + iolog->avg_last[ddir] = elapsed - (this_window - iolog->avg_msec); return iolog->avg_msec; } @@ -2282,16 +2341,16 @@ } static void add_clat_percentile_sample(struct thread_stat *ts, - unsigned long usec, enum fio_ddir ddir) + unsigned long long nsec, enum fio_ddir ddir) { - unsigned int idx = plat_val_to_idx(usec); + unsigned int idx = plat_val_to_idx(nsec); assert(idx < FIO_IO_U_PLAT_NR); ts->io_u_plat[ddir][idx]++; } void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, - unsigned long usec, unsigned int bs, uint64_t offset) + unsigned long long nsec, unsigned int bs, uint64_t offset) { unsigned long elapsed, this_window; struct thread_stat *ts = &td->ts; @@ -2299,14 +2358,14 @@ td_io_u_lock(td); - add_stat_sample(&ts->clat_stat[ddir], usec); + add_stat_sample(&ts->clat_stat[ddir], nsec); if 
(td->clat_log) - add_log_sample(td, td->clat_log, sample_val(usec), ddir, bs, + add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs, offset); if (ts->clat_percentiles) - add_clat_percentile_sample(ts, usec, ddir); + add_clat_percentile_sample(ts, nsec, ddir); if (iolog && iolog->hist_msec) { struct io_hist *hw = &iolog->hist_window[ddir]; @@ -2368,7 +2427,7 @@ } void add_lat_sample(struct thread_data *td, enum fio_ddir ddir, - unsigned long usec, unsigned int bs, uint64_t offset) + unsigned long long nsec, unsigned int bs, uint64_t offset) { struct thread_stat *ts = &td->ts; @@ -2377,23 +2436,26 @@ td_io_u_lock(td); - add_stat_sample(&ts->lat_stat[ddir], usec); + add_stat_sample(&ts->lat_stat[ddir], nsec); if (td->lat_log) - add_log_sample(td, td->lat_log, sample_val(usec), ddir, bs, + add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs, offset); + if (ts->lat_percentiles) + add_clat_percentile_sample(ts, nsec, ddir); + td_io_u_unlock(td); } void add_bw_sample(struct thread_data *td, struct io_u *io_u, - unsigned int bytes, unsigned long spent) + unsigned int bytes, unsigned long long spent) { struct thread_stat *ts = &td->ts; unsigned long rate; if (spent) - rate = bytes * 1000 / spent; + rate = (unsigned long) (bytes * 1000000ULL / spent); else rate = 0; @@ -2409,64 +2471,76 @@ td_io_u_unlock(td); } -static int add_bw_samples(struct thread_data *td, struct timeval *t) +static int __add_samples(struct thread_data *td, struct timespec *parent_tv, + struct timespec *t, unsigned int avg_time, + uint64_t *this_io_bytes, uint64_t *stat_io_bytes, + struct io_stat *stat, struct io_log *log, + bool is_kb) { - struct thread_stat *ts = &td->ts; unsigned long spent, rate; enum fio_ddir ddir; unsigned int next, next_log; - next_log = td->o.bw_avg_time; + next_log = avg_time; - spent = mtime_since(&td->bw_sample_time, t); - if (spent < td->o.bw_avg_time && - td->o.bw_avg_time - spent >= LOG_MSEC_SLACK) - return td->o.bw_avg_time - spent; + spent = 
mtime_since(parent_tv, t); + if (spent < avg_time && avg_time - spent >= LOG_MSEC_SLACK) + return avg_time - spent; td_io_u_lock(td); /* * Compute both read and write rates for the interval. */ - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { uint64_t delta; - delta = td->this_io_bytes[ddir] - td->stat_io_bytes[ddir]; + delta = this_io_bytes[ddir] - stat_io_bytes[ddir]; if (!delta) continue; /* No entries for interval */ - if (spent) - rate = delta * 1000 / spent / 1024; - else + if (spent) { + if (is_kb) + rate = delta * 1000 / spent / 1024; /* KiB/s */ + else + rate = (delta * 1000) / spent; + } else rate = 0; - add_stat_sample(&ts->bw_stat[ddir], rate); + add_stat_sample(&stat[ddir], rate); - if (td->bw_log) { + if (log) { unsigned int bs = 0; if (td->o.min_bs[ddir] == td->o.max_bs[ddir]) bs = td->o.min_bs[ddir]; - next = add_log_sample(td, td->bw_log, sample_val(rate), - ddir, bs, 0); + next = add_log_sample(td, log, sample_val(rate), ddir, bs, 0); next_log = min(next_log, next); } - td->stat_io_bytes[ddir] = td->this_io_bytes[ddir]; + stat_io_bytes[ddir] = this_io_bytes[ddir]; } - timeval_add_msec(&td->bw_sample_time, td->o.bw_avg_time); + timespec_add_msec(parent_tv, avg_time); td_io_u_unlock(td); - if (spent <= td->o.bw_avg_time) - return min(next_log, td->o.bw_avg_time); + if (spent <= avg_time) + next = avg_time; + else + next = avg_time - (1 + spent - avg_time); - next = td->o.bw_avg_time - (1 + spent - td->o.bw_avg_time); return min(next, next_log); } +static int add_bw_samples(struct thread_data *td, struct timespec *t) +{ + return __add_samples(td, &td->bw_sample_time, t, td->o.bw_avg_time, + td->this_io_bytes, td->stat_io_bytes, + td->ts.bw_stat, td->bw_log, true); +} + void add_iops_sample(struct thread_data *td, struct io_u *io_u, unsigned int bytes) { @@ -2484,62 +2558,11 @@ td_io_u_unlock(td); } -static int add_iops_samples(struct thread_data *td, struct timeval *t) +static int 
add_iops_samples(struct thread_data *td, struct timespec *t) { - struct thread_stat *ts = &td->ts; - unsigned long spent, iops; - enum fio_ddir ddir; - unsigned int next, next_log; - - next_log = td->o.iops_avg_time; - - spent = mtime_since(&td->iops_sample_time, t); - if (spent < td->o.iops_avg_time && - td->o.iops_avg_time - spent >= LOG_MSEC_SLACK) - return td->o.iops_avg_time - spent; - - td_io_u_lock(td); - - /* - * Compute both read and write rates for the interval. - */ - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { - uint64_t delta; - - delta = td->this_io_blocks[ddir] - td->stat_io_blocks[ddir]; - if (!delta) - continue; /* No entries for interval */ - - if (spent) - iops = (delta * 1000) / spent; - else - iops = 0; - - add_stat_sample(&ts->iops_stat[ddir], iops); - - if (td->iops_log) { - unsigned int bs = 0; - - if (td->o.min_bs[ddir] == td->o.max_bs[ddir]) - bs = td->o.min_bs[ddir]; - - next = add_log_sample(td, td->iops_log, - sample_val(iops), ddir, bs, 0); - next_log = min(next_log, next); - } - - td->stat_io_blocks[ddir] = td->this_io_blocks[ddir]; - } - - timeval_add_msec(&td->iops_sample_time, td->o.iops_avg_time); - - td_io_u_unlock(td); - - if (spent <= td->o.iops_avg_time) - return min(next_log, td->o.iops_avg_time); - - next = td->o.iops_avg_time - (1 + spent - td->o.iops_avg_time); - return min(next, next_log); + return __add_samples(td, &td->iops_sample_time, t, td->o.iops_avg_time, + td->this_io_blocks, td->stat_io_blocks, + td->ts.iops_stat, td->iops_log, false); } /* @@ -2549,23 +2572,27 @@ { struct thread_data *td; unsigned int next = ~0U, tmp; - struct timeval now; + struct timespec now; int i; fio_gettime(&now, NULL); for_each_td(td, i) { + if (!td->o.stats) + continue; if (in_ramp_time(td) || !(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING)) { next = min(td->o.iops_avg_time, td->o.bw_avg_time); continue; } - if (td->bw_log && !per_unit_log(td->bw_log)) { + if (!td->bw_log || + (td->bw_log && 
!per_unit_log(td->bw_log))) { tmp = add_bw_samples(td, &now); if (tmp < next) next = tmp; } - if (td->iops_log && !per_unit_log(td->iops_log)) { + if (!td->iops_log || + (td->iops_log && !per_unit_log(td->iops_log))) { tmp = add_iops_samples(td, &now); if (tmp < next) next = tmp; diff -Nru fio-2.16/stat.h fio-3.1/stat.h --- fio-2.16/stat.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/stat.h 2017-09-28 10:23:20.000000000 +0000 @@ -7,7 +7,7 @@ struct group_run_stats { uint64_t max_run[DDIR_RWDIR_CNT], min_run[DDIR_RWDIR_CNT]; uint64_t max_bw[DDIR_RWDIR_CNT], min_bw[DDIR_RWDIR_CNT]; - uint64_t io_kb[DDIR_RWDIR_CNT]; + uint64_t iobytes[DDIR_RWDIR_CNT]; uint64_t agg[DDIR_RWDIR_CNT]; uint32_t kb_base; uint32_t unit_base; @@ -19,6 +19,7 @@ * How many depth levels to log */ #define FIO_IO_U_MAP_NR 7 +#define FIO_IO_U_LAT_N_NR 10 #define FIO_IO_U_LAT_U_NR 10 #define FIO_IO_U_LAT_M_NR 12 @@ -108,7 +109,7 @@ #define FIO_IO_U_PLAT_BITS 6 #define FIO_IO_U_PLAT_VAL (1 << FIO_IO_U_PLAT_BITS) -#define FIO_IO_U_PLAT_GROUP_NR 19 +#define FIO_IO_U_PLAT_GROUP_NR 29 #define FIO_IO_U_PLAT_NR (FIO_IO_U_PLAT_GROUP_NR * FIO_IO_U_PLAT_VAL) #define FIO_IO_U_LIST_MAX_LEN 20 /* The size of the default and user-specified list of percentiles */ @@ -171,13 +172,15 @@ /* * IO depth and latency stats */ - uint64_t clat_percentiles; + uint32_t clat_percentiles; + uint32_t lat_percentiles; uint64_t percentile_precision; fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN]; uint32_t io_u_map[FIO_IO_U_MAP_NR]; uint32_t io_u_submit[FIO_IO_U_MAP_NR]; uint32_t io_u_complete[FIO_IO_U_MAP_NR]; + uint32_t io_u_lat_n[FIO_IO_U_LAT_N_NR]; uint32_t io_u_lat_u[FIO_IO_U_LAT_U_NR]; uint32_t io_u_lat_m[FIO_IO_U_LAT_M_NR]; uint32_t io_u_plat[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR]; @@ -242,17 +245,17 @@ uint32_t nr_pending; uint32_t nr_setting_up; - uint32_t files_open; - uint64_t m_rate[DDIR_RWDIR_CNT], t_rate[DDIR_RWDIR_CNT]; - uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT]; uint64_t rate[DDIR_RWDIR_CNT]; + 
uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT]; uint32_t iops[DDIR_RWDIR_CNT]; uint64_t elapsed_sec; uint64_t eta_sec; uint32_t is_pow2; uint32_t unit_base; + uint32_t files_open; + /* * Network 'copy' of run_str[] */ @@ -286,8 +289,9 @@ extern void init_thread_stat(struct thread_stat *ts); extern void init_group_run_stat(struct group_run_stats *gs); extern void eta_to_str(char *str, unsigned long eta_sec); -extern bool calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max, double *mean, double *dev); -extern unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned int **output, unsigned int *maxv, unsigned int *minv); +extern bool calc_lat(struct io_stat *is, unsigned long long *min, unsigned long long *max, double *mean, double *dev); +extern unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned long long **output, unsigned long long *maxv, unsigned long long *minv); +extern void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat); extern void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat); extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat); extern void stat_calc_dist(unsigned int *map, unsigned long total, double *io_u_dist); @@ -295,9 +299,9 @@ extern void update_rusage_stat(struct thread_data *); extern void clear_rusage_stat(struct thread_data *); -extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long, +extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long, unsigned int, uint64_t); -extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long, +extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long, unsigned int, uint64_t); extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long, unsigned int, uint64_t); @@ -305,16 +309,17 @@ extern void add_iops_sample(struct thread_data *, struct 
io_u *, unsigned int); extern void add_bw_sample(struct thread_data *, struct io_u *, - unsigned int, unsigned long); + unsigned int, unsigned long long); extern int calc_log_samples(void); extern struct io_log *agg_io_log[DDIR_RWDIR_CNT]; extern int write_bw_log; -static inline bool usec_to_msec(unsigned long *min, unsigned long *max, - double *mean, double *dev) +static inline bool nsec_to_usec(unsigned long long *min, + unsigned long long *max, double *mean, + double *dev) { - if (*min > 1000 && *max > 1000 && *mean > 1000.0 && *dev > 1000.0) { + if (*min > 2000 && *max > 99999 && *dev > 1000.0) { *min /= 1000; *max /= 1000; *mean /= 1000.0; @@ -324,6 +329,22 @@ return false; } + +static inline bool nsec_to_msec(unsigned long long *min, + unsigned long long *max, double *mean, + double *dev) +{ + if (*min > 2000000 && *max > 99999999ULL && *dev > 1000000.0) { + *min /= 1000000; + *max /= 1000000; + *mean /= 1000000.0; + *dev /= 1000000.0; + return true; + } + + return false; +} + /* * Worst level condensing would be 1:5, so allow enough room for that */ diff -Nru fio-2.16/steadystate.c fio-3.1/steadystate.c --- fio-2.16/steadystate.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/steadystate.c 2017-09-28 10:23:20.000000000 +0000 @@ -8,13 +8,8 @@ static void steadystate_alloc(struct thread_data *td) { - int i; - - td->ss.bw_data = malloc(td->ss.dur * sizeof(uint64_t)); - td->ss.iops_data = malloc(td->ss.dur * sizeof(uint64_t)); - /* initialize so that it is obvious if the cache is not full in the output */ - for (i = 0; i < td->ss.dur; i++) - td->ss.iops_data[i] = td->ss.bw_data[i] = 0; + td->ss.bw_data = calloc(td->ss.dur, sizeof(uint64_t)); + td->ss.iops_data = calloc(td->ss.dur, sizeof(uint64_t)); td->ss.state |= __FIO_SS_DATA; } @@ -201,7 +196,7 @@ int i, j, ddir, prev_groupid, group_ramp_time_over = 0; unsigned long rate_time; struct thread_data *td, *td2; - struct timeval now; + struct timespec now; uint64_t group_bw = 0, group_iops = 0; uint64_t td_iops, 
td_bytes; bool ret; @@ -236,7 +231,7 @@ } td_io_u_lock(td); - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { td_iops += td->io_blocks[ddir]; td_bytes += td->io_bytes[ddir]; } diff -Nru fio-2.16/steadystate.h fio-3.1/steadystate.h --- fio-2.16/steadystate.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/steadystate.h 2017-09-28 10:23:20.000000000 +0000 @@ -35,7 +35,7 @@ uint64_t sum_xy; uint64_t oldest_y; - struct timeval prev_time; + struct timespec prev_time; uint64_t prev_iops; uint64_t prev_bytes; }; diff -Nru fio-2.16/t/arch.c fio-3.1/t/arch.c --- fio-2.16/t/arch.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/arch.c 2017-09-28 10:23:20.000000000 +0000 @@ -1,5 +1,5 @@ #include "../arch/arch.h" unsigned long arch_flags = 0; -int tsc_reliable; +bool tsc_reliable; int arch_random; diff -Nru fio-2.16/t/axmap.c fio-3.1/t/axmap.c --- fio-2.16/t/axmap.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/axmap.c 2017-09-28 10:23:20.000000000 +0000 @@ -8,16 +8,6 @@ #include "../lib/lfsr.h" #include "../lib/axmap.h" -void *smalloc(size_t size) -{ - return malloc(size); -} - -void sfree(void *ptr) -{ - free(ptr); -} - static int test_regular(size_t size, int seed) { struct fio_lfsr lfsr; diff -Nru fio-2.16/t/btrace2fio.c fio-3.1/t/btrace2fio.c --- fio-2.16/t/btrace2fio.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/btrace2fio.c 2017-09-28 10:23:20.000000000 +0000 @@ -62,7 +62,7 @@ uint64_t first_ttime[DDIR_RWDIR_CNT]; uint64_t last_ttime[DDIR_RWDIR_CNT]; - uint64_t kb[DDIR_RWDIR_CNT]; + uint64_t kib[DDIR_RWDIR_CNT]; uint64_t start_delay; }; @@ -406,7 +406,7 @@ i = inflight_find(t->sector + (t->bytes >> 9)); if (i) { - i->p->o.kb[t_to_rwdir(t)] += (t->bytes >> 10); + i->p->o.kib[t_to_rwdir(t)] += (t->bytes >> 10); i->p->o.complete_seen = 1; inflight_remove(i); } @@ -556,7 +556,7 @@ return bsb->nr - bsa->nr; } -static unsigned long o_to_kb_rate(struct btrace_out *o, int rw) +static unsigned long 
o_to_kib_rate(struct btrace_out *o, int rw) { uint64_t usec = (o->last_ttime[rw] - o->first_ttime[rw]) / 1000ULL; uint64_t val; @@ -568,7 +568,7 @@ if (!usec) return 0; - val = o->kb[rw] * 1000ULL; + val = o->kib[rw] * 1000ULL; return val / usec; } @@ -623,7 +623,7 @@ printf("\tmerges: %lu (perc=%3.2f%%)\n", o->merges[i], perc); perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i]; printf("\tseq: %lu (perc=%3.2f%%)\n", (unsigned long) o->seq[i], perc); - printf("\trate: %lu KB/sec\n", o_to_kb_rate(o, i)); + printf("\trate: %lu KiB/sec\n", o_to_kib_rate(o, i)); for (j = 0; j < o->nr_bs[i]; j++) { struct bs *bs = &o->bs[i][j]; @@ -746,7 +746,7 @@ for (i = 0; i < DDIR_RWDIR_CNT; i++) { unsigned long rate; - rate = o_to_kb_rate(o, i); + rate = o_to_kib_rate(o, i); if (i) printf(","); if (rate) @@ -810,7 +810,7 @@ for (i = 0; i < DDIR_RWDIR_CNT; i++) { unsigned long this_rate; - this_rate = o_to_kb_rate(o, i); + this_rate = o_to_kib_rate(o, i); if (this_rate < rate_threshold) { remove_ddir(o, i); this_rate = 0; @@ -926,7 +926,7 @@ oa->ios[i] += ob->ios[i]; oa->merges[i] += ob->merges[i]; oa->seq[i] += ob->seq[i]; - oa->kb[i] += ob->kb[i]; + oa->kib[i] += ob->kib[i]; oa->first_ttime[i] = min(oa->first_ttime[i], ob->first_ttime[i]); oa->last_ttime[i] = max(oa->last_ttime[i], ob->last_ttime[i]); merge_bs(&oa->bs[i], &oa->nr_bs[i], ob->bs[i], ob->nr_bs[i]); @@ -1021,7 +1021,7 @@ log_err("\t-n\tNumber IOS threshold to ignore task\n"); log_err("\t-f\tFio job file output\n"); log_err("\t-d\tUse this file/device for replay\n"); - log_err("\t-r\tIgnore jobs with less than this KB/sec rate\n"); + log_err("\t-r\tIgnore jobs with less than this KiB/sec rate\n"); log_err("\t-R\tSet rate in fio job (def=%u)\n", set_rate); log_err("\t-D\tCap queue depth at this value (def=%u)\n", max_depth); log_err("\t-c\tCollapse \"identical\" jobs (def=%u)\n", collapse_entries); diff -Nru fio-2.16/t/debug.c fio-3.1/t/debug.c --- fio-2.16/t/debug.c 2016-12-20 06:12:56.000000000 +0000 +++ 
fio-3.1/t/debug.c 2017-09-28 10:23:20.000000000 +0000 @@ -1,7 +1,7 @@ #include FILE *f_err; -struct timeval *fio_tv = NULL; +struct timespec *fio_ts = NULL; unsigned long fio_debug = 0; void __dprint(int type, const char *str, ...) diff -Nru fio-2.16/t/dedupe.c fio-3.1/t/dedupe.c --- fio-2.16/t/dedupe.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/dedupe.c 2017-09-28 10:23:20.000000000 +0000 @@ -14,7 +14,6 @@ #include #include -#include "../lib/rbtree.h" #include "../flist.h" #include "../log.h" #include "../mutex.h" @@ -25,6 +24,7 @@ #include "../os/os.h" #include "../gettime.h" #include "../fio_time.h" +#include "../lib/rbtree.h" #include "../lib/bloom.h" #include "debug.h" @@ -334,7 +334,7 @@ static void show_progress(struct worker_thread *threads, unsigned long total) { unsigned long last_nitems = 0; - struct timeval last_tv; + struct timespec last_tv; fio_gettime(&last_tv, NULL); @@ -363,7 +363,7 @@ tdiff = mtime_since_now(&last_tv); if (tdiff) { this_items = (this_items * 1000) / (tdiff * 1024); - printf("%3.2f%% done (%luKB/sec)\r", perc, this_items); + printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items); last_nitems = nitems; fio_gettime(&last_tv, NULL); } else diff -Nru fio-2.16/t/genzipf.c fio-3.1/t/genzipf.c --- fio-2.16/t/genzipf.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/genzipf.c 2017-09-28 10:23:20.000000000 +0000 @@ -3,8 +3,8 @@ * what an access pattern would look like. 
* * For instance, the following would generate a zipf distribution - * with theta 1.2, using 262144 (1 GB / 4096) values and split the reporting into - * 20 buckets: + * with theta 1.2, using 262144 (1 GiB / 4096) values and split the + * reporting into 20 buckets: * * ./t/fio-genzipf -t zipf -i 1.2 -g 1 -b 4096 -o 20 * @@ -49,7 +49,7 @@ }; static int dist_type = TYPE_ZIPF; -static unsigned long gb_size = 500; +static unsigned long gib_size = 500; static unsigned long block_size = 4096; static unsigned long output_nranges = DEF_NR_OUTPUT; static double percentage; @@ -131,7 +131,7 @@ } break; case 'g': - gb_size = strtoul(optarg, NULL, 10); + gib_size = strtoul(optarg, NULL, 10); break; case 'i': dist_val = atof(optarg); @@ -291,9 +291,10 @@ return 1; if (output_type != OUTPUT_CSV) - printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size); + printf("Generating %s distribution with %f input and %lu GiB size and %lu block_size.\n", + dist_types[dist_type], dist_val, gib_size, block_size); - nranges = gb_size * 1024 * 1024 * 1024ULL; + nranges = gib_size * 1024 * 1024 * 1024ULL; nranges /= block_size; if (dist_type == TYPE_ZIPF) diff -Nru fio-2.16/t/lfsr-test.c fio-3.1/t/lfsr-test.c --- fio-2.16/t/lfsr-test.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/lfsr-test.c 2017-09-28 10:23:20.000000000 +0000 @@ -27,7 +27,7 @@ int main(int argc, char *argv[]) { int r; - struct timeval start, end; + struct timespec start, end; struct fio_lfsr *fl; int verify = 0; unsigned int spin = 0; @@ -80,7 +80,7 @@ v_size = numbers * sizeof(uint8_t); v = malloc(v_size); memset(v, 0, v_size); - printf("\nVerification table is %lf KBs\n", (double)(v_size) / 1024); + printf("\nVerification table is %lf KiB\n", (double)(v_size) / 1024); } v_start = v; diff -Nru fio-2.16/t/log.c fio-3.1/t/log.c --- fio-2.16/t/log.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/log.c 2017-09-28 10:23:20.000000000 
+0000 @@ -2,7 +2,7 @@ #include #include "../minmax.h" -int log_err(const char *format, ...) +size_t log_err(const char *format, ...) { char buffer[1024]; va_list args; @@ -16,7 +16,7 @@ return fwrite(buffer, len, 1, stderr); } -int log_info(const char *format, ...) +size_t log_info(const char *format, ...) { char buffer[1024]; va_list args; diff -Nru fio-2.16/t/memlock.c fio-3.1/t/memlock.c --- fio-2.16/t/memlock.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/memlock.c 2017-09-28 10:23:20.000000000 +0000 @@ -4,7 +4,7 @@ #include static struct thread_data { - unsigned long mb; + unsigned long mib; } td; static void *worker(void *data) @@ -15,14 +15,14 @@ char *buf; int i, first = 1; - size = td->mb * 1024UL * 1024UL; + size = td->mib * 1024UL * 1024UL; buf = malloc(size); for (i = 0; i < 100000; i++) { for (index = 0; index + 4096 < size; index += 4096) memset(&buf[index+512], 0x89, 512); if (first) { - printf("loop%d: did %lu MB\n", i+1, size/(1024UL*1024UL)); + printf("loop%d: did %lu MiB\n", i+1, size/(1024UL*1024UL)); first = 0; } } @@ -31,20 +31,20 @@ int main(int argc, char *argv[]) { - unsigned long mb, threads; + unsigned long mib, threads; pthread_t *pthreads; int i; if (argc < 3) { - printf("%s: \n", argv[0]); + printf("%s: \n", argv[0]); return 1; } - mb = strtoul(argv[1], NULL, 10); + mib = strtoul(argv[1], NULL, 10); threads = strtoul(argv[2], NULL, 10); pthreads = calloc(threads, sizeof(pthread_t)); - td.mb = mb; + td.mib = mib; for (i = 0; i < threads; i++) pthread_create(&pthreads[i], NULL, worker, &td); diff -Nru fio-2.16/t/read-to-pipe-async.c fio-3.1/t/read-to-pipe-async.c --- fio-2.16/t/read-to-pipe-async.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/read-to-pipe-async.c 2017-09-28 10:23:20.000000000 +0000 @@ -661,9 +661,9 @@ bytes /= 1024; rate = (bytes * 1000UL * 1000UL) / utime_since(&s, &re); - fprintf(stderr, "Read rate (KB/sec) : %lu\n", rate); + fprintf(stderr, "Read rate (KiB/sec) : %lu\n", rate); rate = (bytes * 1000UL * 
1000UL) / utime_since(&s, &we); - fprintf(stderr, "Write rate (KB/sec): %lu\n", rate); + fprintf(stderr, "Write rate (KiB/sec): %lu\n", rate); close(fd); return 0; diff -Nru fio-2.16/t/stest.c fio-3.1/t/stest.c --- fio-2.16/t/stest.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/stest.c 2017-09-28 10:23:20.000000000 +0000 @@ -59,15 +59,6 @@ return 0; } -static int do_specific_alloc(unsigned long size) -{ - void *ptr; - - ptr = smalloc(size); - sfree(ptr); - return 0; -} - int main(int argc, char *argv[]) { arch_init(argv); @@ -76,9 +67,6 @@ do_rand_allocs(); - /* smalloc bug, commit 271067a6 */ - do_specific_alloc(671386584); - scleanup(); return 0; } diff -Nru fio-2.16/t/time-test.c fio-3.1/t/time-test.c --- fio-2.16/t/time-test.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/t/time-test.c 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,544 @@ +/* + * Carry out arithmetic to explore conversion of CPU clock ticks to nsec + * + * When we use the CPU clock for timing, we do the following: + * + * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks + * to actual time. + * + * Using gettimeofday() or clock_gettime(), count how many CPU clock + * ticks occur per usec + * + * 2) Calculate conversion factors so that we can ultimately convert + * from clocks ticks to nsec with + * nsec = (ticks * clock_mult) >> clock_shift + * + * This is equivalent to + * nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER + * where + * clock_mult = MULTIPLIER / cycles_per_nsec + * MULTIPLIER = 2^clock_shift + * + * It would be simpler to just calculate nsec = ticks / cycles_per_nsec, + * but all of this is necessary because of rounding when calculating + * cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply + * be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional + * portion is lost with integer arithmetic. 
+ * + * This multiply and shift calculation also has a performance benefit + * as multiplication and bit shift operations are faster than integer + * division. + * + * 3) Dynamically determine clock_shift and clock_mult at run time based + * on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum + * duration for which the conversion will be valid. + * + * The primary constraint is that (ticks * clock_mult) must not overflow + * when ticks is at its maximum value. + * + * So we have + * max_ticks = MAX_CLOCK_SEC * 1000000000 * cycles_per_nsec + * max_ticks * clock_mult <= ULLONG_MAX + * max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX + * MULTIPLIER <= ULLONG_MAX * cycles_per_nsec / max_ticks + * + * Then choose the largest clock_shift that satisfies + * 2^clock_shift <= ULLONG_MAX * cycles_per_nsec / max_ticks + * + * Finally calculate the appropriate clock_mult associated with clock_shift + * clock_mult = 2^clock_shift / cycles_per_nsec + * + * 4) In the code below we have cycles_per_usec and use + * cycles_per_nsec = cycles_per_usec / 1000 + * + * + * The code below implements 4 clock tick to nsec conversion strategies + * + * i) 64-bit arithmetic for the (ticks * clock_mult) product with the + * conversion valid for at most MAX_CLOCK_SEC + * + * ii) NOT IMPLEMENTED Use 64-bit integers to emulate 128-bit multiplication + * for the (ticks * clock_mult) product + * + * iii) 64-bit arithmetic with clock ticks to nsec conversion occurring in + * two stages. The first stage counts the number of discrete, large chunks + * of time that have elapsed. To this is added the time represented by + * the remaining clock ticks. The advantage of this strategy is better + * accuracy because the (ticks * clock_mult) product used for final + * fractional chunk + * + * iv) 64-bit arithmetic with the clock ticks to nsec conversion occuring in + * two stages. 
This is carried out using locks to update the number of + * large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed. + * + * v) 128-bit arithmetic used for the clock ticks to nsec conversion. + * + */ + +#include +#include +#include +#include +#include +#include "lib/seqlock.h" + +#define DEBUG 0 +#define MAX_CLOCK_SEC 365*24*60*60ULL +#define MAX_CLOCK_SEC_2STAGE 60*60ULL +#define dprintf(...) if (DEBUG) { printf(__VA_ARGS__); } + +enum { + __CLOCK64_BIT = 1 << 0, + __CLOCK128_BIT = 1 << 1, + __CLOCK_MULT_SHIFT = 1 << 2, + __CLOCK_EMULATE_128 = 1 << 3, + __CLOCK_2STAGE = 1 << 4, + __CLOCK_LOCK = 1 << 5, + + CLOCK64_MULT_SHIFT = __CLOCK64_BIT | __CLOCK_MULT_SHIFT, + CLOCK64_EMULATE_128 = __CLOCK64_BIT | __CLOCK_EMULATE_128, + CLOCK64_2STAGE = __CLOCK64_BIT | __CLOCK_2STAGE, + CLOCK64_LOCK = __CLOCK64_BIT | __CLOCK_LOCK, + CLOCK128_MULT_SHIFT = __CLOCK128_BIT | __CLOCK_MULT_SHIFT, +}; + +static struct seqlock clock_seqlock; +static unsigned long long cycles_start; +static unsigned long long elapsed_nsec; + +static unsigned int max_cycles_shift; +static unsigned long long max_cycles_mask; +static unsigned long long nsecs_for_max_cycles; + +static unsigned int clock_shift; +static unsigned long long clock_mult; + +static unsigned long long *nsecs; +static unsigned long long clock_mult64_128[2]; +static __uint128_t clock_mult128; + +/* + * Functions for carrying out 128-bit + * arithmetic using 64-bit integers + * + * 128-bit integers are stored as + * arrays of two 64-bit integers + * + * Ordering is little endian + * + * a[0] has the less significant bits + * a[1] has the more significant bits + * + * NOT FULLY IMPLEMENTED + */ +static void do_mult(unsigned long long a[2], unsigned long long b, + unsigned long long product[2]) +{ + product[0] = product[1] = 0; + return; +} + +static void do_div(unsigned long long a[2], unsigned long long b, + unsigned long long c[2]) +{ + return; +} + +static void do_shift64(unsigned long long a[2], unsigned int count) +{ + a[0] = 
a[1] >> (count-64); + a[1] = 0; +} + +static void do_shift(unsigned long long a[2], unsigned int count) +{ + if (count > 64) + do_shift64(a, count); + else { + while (count--) { + a[0] >>= 1; + a[0] |= a[1] << 63; + a[1] >>= 1; + } + } +} + +static void update_clock(unsigned long long t) +{ + write_seqlock_begin(&clock_seqlock); + elapsed_nsec = (t >> max_cycles_shift) * nsecs_for_max_cycles; + cycles_start = t & ~max_cycles_mask; + write_seqlock_end(&clock_seqlock); +} + +static unsigned long long _get_nsec(int mode, unsigned long long t) +{ + switch(mode) { + case CLOCK64_MULT_SHIFT: + return (t * clock_mult) >> clock_shift; + case CLOCK64_EMULATE_128: { + unsigned long long product[2] = { }; + + do_mult(clock_mult64_128, t, product); + do_shift(product, clock_shift); + return product[0]; + } + case CLOCK64_2STAGE: { + unsigned long long multiples, nsec; + + multiples = t >> max_cycles_shift; + dprintf("multiples=%llu\n", multiples); + nsec = multiples * nsecs_for_max_cycles; + nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift; + return nsec; + } + case CLOCK64_LOCK: { + unsigned int seq; + unsigned long long nsec; + + do { + seq = read_seqlock_begin(&clock_seqlock); + nsec = elapsed_nsec; + nsec += ((t - cycles_start) * clock_mult) >> clock_shift; + } while (read_seqlock_retry(&clock_seqlock, seq)); + return nsec; + } + case CLOCK128_MULT_SHIFT: + return (unsigned long long)((t * clock_mult128) >> clock_shift); + default: + assert(0); + } +} + +static unsigned long long get_nsec(int mode, unsigned long long t) +{ + if (mode == CLOCK64_LOCK) { + update_clock(t); + } + + return _get_nsec(mode, t); +} + +static void calc_mult_shift(int mode, void *mult, unsigned int *shift, + unsigned long long max_sec, + unsigned long long cycles_per_usec) +{ + unsigned long long max_ticks; + max_ticks = max_sec * cycles_per_usec * 1000000ULL; + + switch (mode) { + case CLOCK64_MULT_SHIFT: { + unsigned long long max_mult, tmp; + unsigned int sft = 0; + + /* + * Calculate 
the largest multiplier that will not + * produce a 64-bit overflow in the multiplication + * step of the clock ticks to nsec conversion + */ + max_mult = ULLONG_MAX / max_ticks; + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + tmp = max_mult * cycles_per_usec / 1000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprintf("tmp=%llu, sft=%u\n", tmp, sft); + } + + *shift = sft; + *((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec); + break; + } + case CLOCK64_EMULATE_128: { + unsigned long long max_mult[2], tmp[2] = { }; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 128-bit overflow in the multiplication + * step of the clock ticks to nsec conversion, + * but use only 64-bit integers in the process + */ + max_mult[0] = max_mult[1] = ULLONG_MAX; + do_div(max_mult, max_ticks, max_mult); + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", + max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + do_div(max_mult, cycles_per_usec, tmp); + do_div(tmp, 1000ULL, tmp); + while (tmp[0] > 1 || tmp[1] > 1) { + do_shift(tmp, 1); + sft++; + dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft); + } + + *shift = sft; +// *((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); + break; + } + case CLOCK64_2STAGE: { + unsigned long long tmp; +/* + * This clock tick to nsec conversion requires two stages. + * + * Stage 1: Determine how many ~MAX_CLOCK_SEC_2STAGE periods worth of clock ticks + * have elapsed and set nsecs to the appropriate value for those + * ~MAX_CLOCK_SEC_2STAGE periods. 
+ * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC_2STAGE periods from + * Stage 1. Convert remaining clock ticks to nsecs and add to previously + * set nsec value. + * + * To optimize the arithmetic operations, use the greatest power of 2 ticks + * less than the number of ticks in MAX_CLOCK_SEC_2STAGE seconds. + * + */ + // Use a period shorter than MAX_CLOCK_SEC here for better accuracy + calc_mult_shift(CLOCK64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC_2STAGE, cycles_per_usec); + + // Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC_2STAGE + max_cycles_shift = max_cycles_mask = 0; + tmp = MAX_CLOCK_SEC_2STAGE * 1000000ULL * cycles_per_usec; + dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + while (tmp > 1) { + tmp >>= 1; + max_cycles_shift++; + dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + } + // if use use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will + // have a discontinuity every (1ULL << max_cycles_shift) cycles + nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift; + + // Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) + for (tmp = 0; tmp < max_cycles_shift; tmp++) + max_cycles_mask |= 1ULL << tmp; + + dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n", + max_cycles_shift, (1ULL << max_cycles_shift), + nsecs_for_max_cycles, max_cycles_mask); + + + break; + } + case CLOCK64_LOCK: { +/* + * This clock tick to nsec conversion also requires two stages. + * + * Stage 1: Add to nsec the current running total of elapsed long periods + * Stage 2: Subtract from clock ticks the tick count corresponding to the + * most recently elapsed long period. Convert the remaining ticks to + * nsec and add to the previous nsec value. 
+ * + * In practice the elapsed nsec from Stage 1 and the tick count subtracted + * in Stage 2 will be maintained in a separate thread. + * + */ + calc_mult_shift(CLOCK64_2STAGE, mult, shift, MAX_CLOCK_SEC, cycles_per_usec); + cycles_start = 0; + break; + } + case CLOCK128_MULT_SHIFT: { + __uint128_t max_mult, tmp; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 128-bit overflow in the multiplication + * step of the clock ticks to nsec conversion + */ + max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX; + max_mult /= max_ticks; + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", + max_ticks, __builtin_clzll(max_ticks), + (unsigned long long) (max_mult >> 64), + (unsigned long long) max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + tmp = max_mult * cycles_per_usec / 1000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprintf("tmp=0x%016llx%016llx, sft=%u\n", + (unsigned long long) (tmp >> 64), + (unsigned long long) tmp, sft); + } + + *shift = sft; + *((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); + break; + } + } +} + +static int discontinuity(int mode, int delta_ticks, int delta_nsec, + unsigned long long start, unsigned long len) +{ + int i; + unsigned long mismatches = 0, bad_mismatches = 0; + unsigned long long delta, max_mismatch = 0; + unsigned long long *ns = nsecs; + + for (i = 0; i < len; ns++, i++) { + *ns = get_nsec(mode, start + i); + if (i - delta_ticks >= 0) { + if (*ns > *(ns - delta_ticks)) + delta = *ns - *(ns - delta_ticks); + else + delta = *(ns - delta_ticks) - *ns; + if (delta > delta_nsec) + delta -= delta_nsec; + else + delta = delta_nsec - delta; + if (delta) { + mismatches++; + if (delta > 1) + bad_mismatches++; + if (delta > max_mismatch) + max_mismatch = delta; + } + } + if (!bad_mismatches) + assert(max_mismatch == 0 || max_mismatch == 1); + if (!mismatches) 
+ assert(max_mismatch == 0); + } + + printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n", + mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start, + start + len - 1); + return mismatches; +} + +#define MIN_TICKS 1ULL +#define LEN 1000000000ULL +#define NSEC_ONE_SEC 1000000000ULL +#define TESTLEN 9 + +static long long test_clock(int mode, int cycles_per_usec, int fast_test, + int quiet, int delta_ticks, int delta_nsec) +{ + int i; + long long delta; + unsigned long long max_ticks; + unsigned long long nsecs; + void *mult; + unsigned long long test_ns[TESTLEN] = + {NSEC_ONE_SEC, NSEC_ONE_SEC, + NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60, + NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4, + NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24}; + unsigned long long test_ticks[TESTLEN]; + + max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL; + + switch(mode) { + case CLOCK64_MULT_SHIFT: + mult = &clock_mult; + break; + case CLOCK64_EMULATE_128: + mult = clock_mult64_128; + break; + case CLOCK64_2STAGE: + mult = &clock_mult; + break; + case CLOCK64_LOCK: + mult = &clock_mult; + break; + case CLOCK128_MULT_SHIFT: + mult = &clock_mult128; + break; + default: + assert(0); + } + calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec); + nsecs = get_nsec(mode, max_ticks); + delta = nsecs/1000000 - MAX_CLOCK_SEC*1000; + + if (mode == CLOCK64_2STAGE) { + test_ns[0] = nsecs_for_max_cycles - 1; + test_ns[1] = nsecs_for_max_cycles; + test_ticks[0] = (1ULL << max_cycles_shift) - 1; + test_ticks[1] = (1ULL << max_cycles_shift); + + for (i = 2; i < TESTLEN; i++) + test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; + } + else { + for (i = 0; i < TESTLEN; i++) + test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; + } + + if (!quiet) { + printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n", + cycles_per_usec, delta_ticks, 
delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift)); + switch(mode) { + case CLOCK64_LOCK: + case CLOCK64_2STAGE: + case CLOCK64_MULT_SHIFT: { + printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n", + clock_mult, (double) clock_mult / (1ULL << clock_shift)); + break; + } + case CLOCK64_EMULATE_128: { + printf("clock_mult=0x%016llx%016llx\n", + clock_mult64_128[1], clock_mult64_128[0]); + break; + } + case CLOCK128_MULT_SHIFT: { + printf("clock_mult=0x%016llx%016llx\n", + (unsigned long long) (clock_mult128 >> 64), + (unsigned long long) clock_mult128); + break; + } + } + printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n", + nsecs, MAX_CLOCK_SEC*1000000000ULL, delta); + } + + for (i = 0; i < TESTLEN; i++) + { + nsecs = get_nsec(mode, test_ticks[i]); + delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs; + if (!quiet || delta > 0) + printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n", + test_ticks[i], nsecs, test_ns[i], delta); + } + + if (!fast_test) { + discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN); + discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN); + } + + if (!quiet) + printf("\n\n"); + + return delta; +} + +int main(int argc, char *argv[]) +{ + nsecs = malloc(LEN * sizeof(unsigned long long)); + + test_clock(CLOCK64_LOCK, 3333, 1, 0, 0, 0); + test_clock(CLOCK64_LOCK, 1000, 1, 0, 1, 1); + test_clock(CLOCK64_LOCK, 1100, 1, 0, 11, 10); + test_clock(CLOCK64_LOCK, 3000, 1, 0, 3, 1); + test_clock(CLOCK64_LOCK, 3333, 1, 0, 3333, 1000); + test_clock(CLOCK64_LOCK, 3392, 1, 0, 424, 125); + test_clock(CLOCK64_LOCK, 4500, 1, 0, 9, 2); + test_clock(CLOCK64_LOCK, 5000, 1, 0, 5, 1); + + free(nsecs); + return 0; +} diff -Nru fio-2.16/t/verify-state.c fio-3.1/t/verify-state.c --- fio-2.16/t/verify-state.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/t/verify-state.c 2017-09-28 10:23:20.000000000 +0000 @@ -58,7 +58,8 @@ show_s(s, no_s); no_s++; size -= 
__thread_io_list_sz(s->depth, s->nofiles); - s = (void *) s + __thread_io_list_sz(s->depth, s->nofiles); + s = (struct thread_io_list *)((char *) s + + __thread_io_list_sz(s->depth, s->nofiles)); } while (size != 0); } diff -Nru fio-2.16/td_error.c fio-3.1/td_error.c --- fio-2.16/td_error.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/td_error.c 2017-09-28 10:23:20.000000000 +0000 @@ -20,8 +20,7 @@ if (!td->o.ignore_error[etype]) { td->o.ignore_error[etype] = __NON_FATAL_ERR; - td->o.ignore_error_nr[etype] = sizeof(__NON_FATAL_ERR) - / sizeof(int); + td->o.ignore_error_nr[etype] = ARRAY_SIZE(__NON_FATAL_ERR); } if (!(td->o.continue_on_error & (1 << etype))) diff -Nru fio-2.16/td_error.h fio-3.1/td_error.h --- fio-2.16/td_error.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/td_error.h 2017-09-28 10:23:20.000000000 +0000 @@ -2,7 +2,8 @@ #define FIO_TD_ERROR_H /* - * What type of errors to continue on when continue_on_error is used + * What type of errors to continue on when continue_on_error is used, + * and what type of errors to ignore when ignore_error is used. 
*/ enum error_type_bit { ERROR_TYPE_READ_BIT = 0, diff -Nru fio-2.16/thread_options.h fio-3.1/thread_options.h --- fio-2.16/thread_options.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/thread_options.h 2017-09-28 10:23:20.000000000 +0000 @@ -20,6 +20,7 @@ MEM_MMAP, /* use anonynomous mmap */ MEM_MMAPHUGE, /* memory mapped huge file */ MEM_MMAPSHARED, /* use mmap with shared flag */ + MEM_CUDA_MALLOC,/* use GPU memory */ }; #define ERROR_STR_MAX 128 @@ -52,6 +53,7 @@ char *filename_format; char *opendir; char *ioengine; + char *ioengine_so_path; char *mmapfile; enum td_ddir td_ddir; unsigned int rw_seq; @@ -64,11 +66,12 @@ unsigned int iodepth_batch; unsigned int iodepth_batch_complete_min; unsigned int iodepth_batch_complete_max; + unsigned int serialize_overlap; unsigned int unique_filename; unsigned long long size; - unsigned long long io_limit; + unsigned long long io_size; unsigned int size_percent; unsigned int fill_device; unsigned int file_append; @@ -101,6 +104,7 @@ unsigned int end_fsync; unsigned int pre_read; unsigned int sync_io; + unsigned int write_hint; unsigned int verify; unsigned int do_verify; unsigned int verifysort; @@ -198,6 +202,9 @@ unsigned short numa_mem_mode; unsigned int numa_mem_prefer_node; char *numa_memnodes; + unsigned int gpu_dev_id; + unsigned int start_offset_percent; + unsigned int iolog; unsigned int rwmixcycle; unsigned int rwmix[DDIR_RWDIR_CNT]; @@ -206,6 +213,7 @@ unsigned int ioprio_class; unsigned int file_service_type; unsigned int group_reporting; + unsigned int stats; unsigned int fadvise_hint; unsigned int fadvise_stream; enum fio_fallocate_mode fallocate_mode; @@ -232,6 +240,7 @@ unsigned int trim_zero; unsigned long long trim_backlog; unsigned int clat_percentiles; + unsigned int lat_percentiles; unsigned int percentile_precision; /* digits after decimal for percentiles */ fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN]; @@ -300,7 +309,6 @@ fio_fp64_t latency_percentile; unsigned block_error_hist; - unsigned 
int skip_bad; unsigned int replay_align; unsigned int replay_scale; @@ -335,10 +343,11 @@ uint32_t iodepth_batch; uint32_t iodepth_batch_complete_min; uint32_t iodepth_batch_complete_max; - uint32_t __proper_alignment_for_64b; + uint32_t serialize_overlap; + uint32_t lat_percentiles; uint64_t size; - uint64_t io_limit; + uint64_t io_size; uint32_t size_percent; uint32_t fill_device; uint32_t file_append; @@ -372,6 +381,7 @@ uint32_t end_fsync; uint32_t pre_read; uint32_t sync_io; + uint32_t write_hint; uint32_t verify; uint32_t do_verify; uint32_t verifysort; @@ -410,10 +420,10 @@ uint32_t bs_unaligned; uint32_t fsync_on_close; uint32_t bs_is_seq_rand; - uint32_t pad1; uint32_t random_distribution; uint32_t exitall_error; + uint32_t pad; struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX]; uint32_t zone_split_nr[DDIR_RWDIR_CNT]; @@ -466,6 +476,8 @@ uint8_t verify_cpumask[FIO_TOP_STR_MAX]; uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; #endif + uint32_t gpu_dev_id; + uint32_t start_offset_percent; uint32_t cpus_allowed_policy; uint32_t iolog; uint32_t rwmixcycle; @@ -475,6 +487,7 @@ uint32_t ioprio_class; uint32_t file_service_type; uint32_t group_reporting; + uint32_t stats; uint32_t fadvise_hint; uint32_t fadvise_stream; uint32_t fallocate_mode; @@ -502,7 +515,6 @@ uint64_t trim_backlog; uint32_t clat_percentiles; uint32_t percentile_precision; - uint32_t padding; /* REMOVE ME when possible to maintain alignment */ fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN]; uint8_t read_iolog_file[FIO_TOP_STR_MAX]; @@ -571,7 +583,6 @@ fio_fp64_t latency_percentile; uint32_t block_error_hist; - uint32_t skip_bad; uint32_t replay_align; uint32_t replay_scale; diff -Nru fio-2.16/time.c fio-3.1/time.c --- fio-2.16/time.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/time.c 2017-09-28 10:23:20.000000000 +0000 @@ -3,15 +3,23 @@ #include "fio.h" -static struct timeval genesis; +static struct timespec genesis; static unsigned long ns_granularity; -void timeval_add_msec(struct 
timeval *tv, unsigned int msec) +void timespec_add_msec(struct timespec *ts, unsigned int msec) { - tv->tv_usec += 1000 * msec; - if (tv->tv_usec >= 1000000) { - tv->tv_usec -= 1000000; - tv->tv_sec++; + uint64_t adj_nsec = 1000000ULL * msec; + + ts->tv_nsec += adj_nsec; + if (adj_nsec >= 1000000000) { + uint64_t adj_sec = adj_nsec / 1000000000; + + ts->tv_nsec -= adj_sec * 1000000000; + ts->tv_sec += adj_sec; + } + if (ts->tv_nsec >= 1000000000){ + ts->tv_nsec -= 1000000000; + ts->tv_sec++; } } @@ -20,7 +28,7 @@ */ uint64_t usec_spin(unsigned int usec) { - struct timeval start; + struct timespec start; uint64_t t; fio_gettime(&start, NULL); @@ -33,7 +41,7 @@ uint64_t usec_sleep(struct thread_data *td, unsigned long usec) { struct timespec req; - struct timeval tv; + struct timespec tv; uint64_t t = 0; do { @@ -89,31 +97,37 @@ return td->o.ramp_time && !td->ramp_time_over; } -static void parent_update_ramp(struct thread_data *td) +static bool parent_update_ramp(struct thread_data *td) { struct thread_data *parent = td->parent; if (!parent || parent->ramp_time_over) - return; + return false; reset_all_stats(parent); - parent->ramp_time_over = 1; + parent->ramp_time_over = true; td_set_runstate(parent, TD_RAMP); + return true; } bool ramp_time_over(struct thread_data *td) { - struct timeval tv; - if (!td->o.ramp_time || td->ramp_time_over) return true; - fio_gettime(&tv, NULL); - if (utime_since(&td->epoch, &tv) >= td->o.ramp_time) { - td->ramp_time_over = 1; + if (utime_since_now(&td->epoch) >= td->o.ramp_time) { + td->ramp_time_over = true; reset_all_stats(td); td_set_runstate(td, TD_RAMP); - parent_update_ramp(td); + + /* + * If we have a parent, the parent isn't doing IO. Hence + * the parent never enters do_io(), which will switch us + * from RAMP -> RUNNING. Do this manually here. 
+ */ + if (parent_update_ramp(td)) + td_set_runstate(td, TD_RUNNING); + return true; } @@ -130,8 +144,7 @@ * Check the granularity of the nanosleep function */ for (i = 0; i < 10; i++) { - struct timeval tv; - struct timespec ts; + struct timespec tv, ts; unsigned long elapsed; fio_gettime(&tv, NULL); @@ -162,7 +175,7 @@ } } -void fill_start_time(struct timeval *t) +void fill_start_time(struct timespec *t) { memcpy(t, &genesis, sizeof(genesis)); } diff -Nru fio-2.16/tools/fio_jsonplus_clat2csv fio-3.1/tools/fio_jsonplus_clat2csv --- fio-2.16/tools/fio_jsonplus_clat2csv 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.1/tools/fio_jsonplus_clat2csv 2017-09-28 10:23:20.000000000 +0000 @@ -0,0 +1,164 @@ +#!/usr/bin/python +# +# fio_jsonplus_clat2csv +# +# This script converts fio's json+ completion latency data to CSV format. +# +# For example: +# +# Run the following fio jobs: +# ../fio --output=fio-jsonplus.output --output-format=json+ --name=test1 +# --ioengine=null --time_based --runtime=5s --size=1G --rw=randrw +# --name=test2 --ioengine=null --time_based --runtime=3s --size=1G +# --rw=read --name=test3 --ioengine=null --time_based --runtime=4s +# --size=8G --rw=write +# +# Then run: +# fio_jsonplus_clat2csv fio-jsonplus.output fio-latency.csv +# +# You will end up with the following 3 files +# +# -rw-r--r-- 1 root root 6467 Jun 27 14:57 fio-latency_job0.csv +# -rw-r--r-- 1 root root 3985 Jun 27 14:57 fio-latency_job1.csv +# -rw-r--r-- 1 root root 4490 Jun 27 14:57 fio-latency_job2.csv +# +# fio-latency_job0.csv will look something like: +# +# clat_nsec, read_count, read_cumulative, read_percentile, write_count, +# write_cumulative, write_percentile, trim_count, trim_cumulative, +# trim_percentile, +# 25, 1, 1, 1.50870705013e-07, , , , , , , +# 26, 12, 13, 1.96131916517e-06, 947, 947, 0.000142955890032, , , , +# 27, 843677, 843690, 0.127288105112, 838347, 839294, 0.126696959629, , , , +# 28, 1877982, 2721672, 0.410620573454, 1870189, 2709483, 0.409014312345, , , , +# 
29, 4471, 2726143, 0.411295116376, 7718, 2717201, 0.410179395301, , , , +# 30, 2142885, 4869028, 0.734593687087, 2138164, 4855365, 0.732949340025, , , , +# ... +# 2544, , , , 2, 6624404, 0.999997433738, , , , +# 2576, 3, 6628178, 0.99999788781, 4, 6624408, 0.999998037564, , , , +# 2608, 4, 6628182, 0.999998491293, 4, 6624412, 0.999998641391, , , , +# 2640, 3, 6628185, 0.999998943905, 2, 6624414, 0.999998943304, , , , +# 2672, 1, 6628186, 0.999999094776, 3, 6624417, 0.999999396174, , , , +# 2736, 1, 6628187, 0.999999245646, 1, 6624418, 0.99999954713, , , , +# 2768, 2, 6628189, 0.999999547388, 1, 6624419, 0.999999698087, , , , +# 2800, , , , 1, 6624420, 0.999999849043, , , , +# 2832, 1, 6628190, 0.999999698259, , , , , , , +# 4192, 1, 6628191, 0.999999849129, , , , , , , +# 5792, , , , 1, 6624421, 1.0, , , , +# 10304, 1, 6628192, 1.0, , , , , , , +# +# The first line says that you had one read IO with 25ns clat, +# the cumulative number of read IOs at or below 25ns is 1, and +# 25ns is the 0.00001509th percentile for read latency +# +# The job had 2 write IOs complete in 2544ns, +# 6624404 write IOs completed in 2544ns or less, +# and this represents the 99.99974th percentile for write latency +# +# The last line says that one read IO had 10304ns clat, +# 6628192 read IOs had 10304ns or shorter clat, and +# 10304ns is the 100th percentile for read latency +# + +import os +import json +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('source', + help='fio json+ output file containing completion ' + 'latency data') + parser.add_argument('dest', + help='destination file stub for latency data in CSV ' + 'format. 
job number will be appended to filename') + args = parser.parse_args() + + return args + + +def percentile(idx, run_total): + total = run_total[len(run_total)-1] + if total == 0: + return 0 + + return float(run_total[idx]) / total + + +def more_lines(indices, bins): + for key, value in indices.iteritems(): + if value < len(bins[key]): + return True + + return False + + +def main(): + args = parse_args() + + with open(args.source, 'r') as source: + jsondata = json.loads(source.read()) + + for jobnum in range(0, len(jsondata['jobs'])): + bins = {} + run_total = {} + ddir_set = set(['read', 'write', 'trim']) + + prev_ddir = None + for ddir in ddir_set: + bins[ddir] = [[int(key), value] for key, value in + jsondata['jobs'][jobnum][ddir]['clat_ns'] + ['bins'].iteritems()] + bins[ddir] = sorted(bins[ddir], key=lambda bin: bin[0]) + + run_total[ddir] = [0 for x in range(0, len(bins[ddir]))] + if len(bins[ddir]) > 0: + run_total[ddir][0] = bins[ddir][0][1] + for x in range(1, len(bins[ddir])): + run_total[ddir][x] = run_total[ddir][x-1] + \ + bins[ddir][x][1] + + stub, ext = os.path.splitext(args.dest) + outfile = stub + '_job' + str(jobnum) + ext + + with open(outfile, 'w') as output: + output.write("clat_nsec, ") + ddir_list = list(ddir_set) + for ddir in ddir_list: + output.write("{0}_count, {0}_cumulative, {0}_percentile, ". 
+ format(ddir)) + output.write("\n") + +# +# Have a counter for each ddir +# In each round, pick the shortest remaining duration +# and output a line with any values for that duration +# + indices = {x: 0 for x in ddir_list} + while more_lines(indices, bins): + min_lat = 17112760320 + for ddir in ddir_list: + if indices[ddir] < len(bins[ddir]): + min_lat = min(bins[ddir][indices[ddir]][0], min_lat) + + output.write("{0}, ".format(min_lat)) + + for ddir in ddir_list: + if indices[ddir] < len(bins[ddir]) and \ + min_lat == bins[ddir][indices[ddir]][0]: + count = bins[ddir][indices[ddir]][1] + cumulative = run_total[ddir][indices[ddir]] + ptile = percentile(indices[ddir], run_total[ddir]) + output.write("{0}, {1}, {2}, ".format(count, + cumulative, ptile)) + indices[ddir] += 1 + else: + output.write(", , , ") + output.write("\n") + + print "{0} generated".format(outfile) + + +if __name__ == '__main__': + main() diff -Nru fio-2.16/tools/fio_latency2csv.py fio-3.1/tools/fio_latency2csv.py --- fio-2.16/tools/fio_latency2csv.py 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/tools/fio_latency2csv.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,101 +0,0 @@ -#!/usr/bin/python -# -# fio_latency2csv.py -# -# This tool converts fio's json+ completion latency data to CSV format. -# For example: -# -# fio_latency2csv.py fio-jsonplus.output fio-latency.csv -# - -import os -import json -import argparse - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument('source', - help='fio json+ output file containing completion ' - 'latency data') - parser.add_argument('dest', - help='destination file stub for latency data in CSV ' - 'format. job number will be appended to filename') - args = parser.parse_args() - - return args - - -# from stat.c -def plat_idx_to_val(idx, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64): - # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. 
Use - # all bits of the sample as index - if (idx < (FIO_IO_U_PLAT_VAL << 1)): - return idx - - # Find the group and compute the minimum value of that group - error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1 - base = 1 << (error_bits + FIO_IO_U_PLAT_BITS) - - # Find its bucket number of the group - k = idx % FIO_IO_U_PLAT_VAL - - # Return the mean of the range of the bucket - return (base + ((k + 0.5) * (1 << error_bits))) - - -def percentile(idx, run_total): - total = run_total[len(run_total)-1] - if total == 0: - return 0 - - return float(run_total[x]) / total - - -if __name__ == '__main__': - args = parse_args() - - with open(args.source, 'r') as source: - jsondata = json.loads(source.read()) - - bins = {} - bin_const = {} - run_total = {} - ddir_list = ['read', 'write', 'trim'] - const_list = ['FIO_IO_U_PLAT_NR', 'FIO_IO_U_PLAT_BITS', - 'FIO_IO_U_PLAT_VAL'] - - for jobnum in range(0,len(jsondata['jobs'])): - prev_ddir = None - for ddir in ddir_list: - bins[ddir] = jsondata['jobs'][jobnum][ddir]['clat']['bins'] - - bin_const[ddir] = {} - for const in const_list: - bin_const[ddir][const] = bins[ddir].pop(const) - if prev_ddir: - assert bin_const[ddir][const] == bin_const[prev_ddir][const] - prev_ddir = ddir - - run_total[ddir] = [0 for x in - range(bin_const[ddir]['FIO_IO_U_PLAT_NR'])] - run_total[ddir][0] = bins[ddir]['0'] - for x in range(1, bin_const[ddir]['FIO_IO_U_PLAT_NR']): - run_total[ddir][x] = run_total[ddir][x-1] + bins[ddir][str(x)] - - stub, ext = os.path.splitext(args.dest) - outfile = stub + '_job' + str(jobnum) + ext - - with open(outfile, 'w') as output: - output.write("clat (usec),") - for ddir in ddir_list: - output.write("{0},".format(ddir)) - output.write("\n") - - for x in range(bin_const['read']['FIO_IO_U_PLAT_NR']): - output.write("{0},".format(plat_idx_to_val(x, - bin_const['read']['FIO_IO_U_PLAT_BITS'], - bin_const['read']['FIO_IO_U_PLAT_VAL']))) - for ddir in ddir_list: - output.write("{0},".format(percentile(x, run_total[ddir]))) - 
output.write("\n") diff -Nru fio-2.16/tools/fiologparser.py fio-3.1/tools/fiologparser.py --- fio-2.16/tools/fiologparser.py 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/tools/fiologparser.py 2017-09-28 10:23:20.000000000 +0000 @@ -45,7 +45,7 @@ while (start < ftime): end = ftime if ftime < end else end results = [ts.get_value(start, end) for ts in series] - print "%s, %s" % (end, ', '.join(["%0.3f" % i for i in results])) + print("%s, %s" % (end, ', '.join(["%0.3f" % i for i in results]))) start += ctx.interval end += ctx.interval @@ -57,7 +57,7 @@ while (start < ftime): end = ftime if ftime < end else end results = [ts.get_value(start, end) for ts in series] - print "%s, %0.3f" % (end, sum(results)) + print("%s, %0.3f" % (end, sum(results))) start += ctx.interval end += ctx.interval @@ -69,7 +69,7 @@ while (start < ftime): end = ftime if ftime < end else end results = [ts.get_value(start, end) for ts in series] - print "%s, %0.3f" % (end, float(sum(results))/len(results)) + print("%s, %0.3f" % (end, float(sum(results))/len(results))) start += ctx.interval end += ctx.interval @@ -147,11 +147,11 @@ end += ctx.interval total = 0 - for i in xrange(0, len(averages)): + for i in range(0, len(averages)): total += averages[i]*weights[i] - print '%0.3f' % (total/sum(weights)) + print('%0.3f' % (total/sum(weights))) -class TimeSeries(): +class TimeSeries(object): def __init__(self, ctx, fn): self.ctx = ctx self.last = None @@ -185,7 +185,7 @@ value += sample.get_contribution(start, end) return value -class Sample(): +class Sample(object): def __init__(self, ctx, start, end, value): self.ctx = ctx self.start = start diff -Nru fio-2.16/tools/hist/fiologparser_hist.py fio-3.1/tools/hist/fiologparser_hist.py --- fio-2.16/tools/hist/fiologparser_hist.py 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/tools/hist/fiologparser_hist.py 2017-09-28 10:23:20.000000000 +0000 @@ -373,7 +373,7 @@ help='print warning messages to stderr') arg('--group_nr', - default=19, + default=29, 
type=int, help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h') diff -Nru fio-2.16/tools/plot/fio2gnuplot.1 fio-3.1/tools/plot/fio2gnuplot.1 --- fio-2.16/tools/plot/fio2gnuplot.1 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/tools/plot/fio2gnuplot.1 2017-09-28 10:23:20.000000000 +0000 @@ -1,5 +1,5 @@ .\" Text automatically generated by txt2man -.TH fio2gnuplot "07 août 2013" "" "" +.TH fio2gnuplot 1 "August 2013" .SH NAME \fBfio2gnuplot \fP- Render fio's output files with gnuplot .SH SYNOPSIS diff -Nru fio-2.16/.travis.yml fio-3.1/.travis.yml --- fio-2.16/.travis.yml 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/.travis.yml 2017-09-28 10:23:20.000000000 +0000 @@ -1,7 +1,51 @@ language: c +os: + - linux compiler: - clang - gcc +env: + matrix: + - BUILD_ARCH="x86" + - BUILD_ARCH="x86_64" + global: + - MAKEFLAGS="-j 2" +matrix: + include: + - os: osx + compiler: clang # Workaround travis setting CC=["clang", "gcc"] + env: BUILD_ARCH="x86_64" + # Build using the 10.12 SDK but target and run on OSX 10.11 +# - os: osx +# compiler: clang +# osx_image: xcode8 +# env: SDKROOT=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk MACOSX_DEPLOYMENT_TARGET=10.11 + # Build on the latest OSX version (will eventually become obsolete) + - os: osx + compiler: clang + osx_image: xcode8.3 + env: BUILD_ARCH="x86_64" + exclude: + - os: osx + compiler: gcc + exclude: + - os: linux + compiler: clang + env: BUILD_ARCH="x86" # Only do the gcc x86 build to reduce clutter before_install: - - sudo apt-get -qq update - - sudo apt-get install -qq -y libaio-dev libnuma-dev libz-dev + - EXTRA_CFLAGS="-Werror" + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + pkgs=(libaio-dev libnuma-dev libz-dev librbd-dev libibverbs-dev librdmacm-dev); + if [[ "$BUILD_ARCH" == "x86" ]]; then + pkgs=("${pkgs[@]/%/:i386}"); + pkgs+=(gcc-multilib); + EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32"; + else + pkgs+=(glusterfs-common); + fi; + sudo apt-get -qq update; + sudo apt-get 
install --no-install-recommends -qq -y "${pkgs[@]}"; + fi +script: + - ./configure --extra-cflags="${EXTRA_CFLAGS}" && make + - make test diff -Nru fio-2.16/unit_tests/steadystate_tests.py fio-3.1/unit_tests/steadystate_tests.py --- fio-2.16/unit_tests/steadystate_tests.py 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/unit_tests/steadystate_tests.py 2017-09-28 10:23:20.000000000 +0000 @@ -115,7 +115,7 @@ if args.read == None: if os.name == 'posix': args.read = '/dev/zero' - extra = [ "--size=128M" ] + extra = [ "--size=134217728" ] # 128 MiB else: print "ERROR: file for read testing must be specified on non-posix systems" sys.exit(1) diff -Nru fio-2.16/verify.c fio-3.1/verify.c --- fio-2.16/verify.c 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/verify.c 2017-09-28 10:23:20.000000000 +0000 @@ -25,6 +25,7 @@ #include "crc/sha512.h" #include "crc/sha1.h" #include "crc/xxhash.h" +#include "crc/sha3.h" static void populate_hdr(struct thread_data *td, struct io_u *io_u, struct verify_header *hdr, unsigned int header_num, @@ -172,6 +173,18 @@ case VERIFY_SHA512: len = sizeof(struct vhdr_sha512); break; + case VERIFY_SHA3_224: + len = sizeof(struct vhdr_sha3_224); + break; + case VERIFY_SHA3_256: + len = sizeof(struct vhdr_sha3_256); + break; + case VERIFY_SHA3_384: + len = sizeof(struct vhdr_sha3_384); + break; + case VERIFY_SHA3_512: + len = sizeof(struct vhdr_sha3_512); + break; case VERIFY_XXHASH: len = sizeof(struct vhdr_xxhash); break; @@ -258,6 +271,7 @@ fd = open(fname, O_CREAT | O_TRUNC | O_WRONLY, 0644); if (fd < 0) { perror("open verify buf file"); + free(ptr); return; } @@ -374,7 +388,7 @@ (void)paste_format_inplace(pattern, pattern_size, td->o.verify_fmt, td->o.verify_fmt_sz, io_u); - buf = (void *) hdr + header_size; + buf = (char *) hdr + header_size; len = get_hdr_inc(td, io_u) - header_size; mod = (get_hdr_inc(td, io_u) * vc->hdr_num + header_size) % pattern_size; @@ -393,7 +407,8 @@ (unsigned char)pattern[mod], bits); log_err("fio: bad pattern block 
offset %u\n", i); - dump_verify_buffers(hdr, vc); + vc->name = "pattern"; + log_verify_failure(hdr, vc); return EILSEQ; } mod++; @@ -430,6 +445,84 @@ return EILSEQ; } +static int verify_io_u_sha3(struct verify_header *hdr, struct vcont *vc, + struct fio_sha3_ctx *sha3_ctx, uint8_t *sha, + unsigned int sha_size, const char *name) +{ + void *p = io_u_verify_off(hdr, vc); + + dprint(FD_VERIFY, "%s verify io_u %p, len %u\n", name, vc->io_u, hdr->len); + + fio_sha3_update(sha3_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_sha3_final(sha3_ctx); + + if (!memcmp(sha, sha3_ctx->sha, sha_size)) + return 0; + + vc->name = name; + vc->good_crc = sha; + vc->bad_crc = sha3_ctx->sha; + vc->crc_len = sha_size; + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_sha3_224(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_224 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_224_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_224_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_224_DIGEST_SIZE, "sha3-224"); +} + +static int verify_io_u_sha3_256(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_256 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_256_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_256_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_256_DIGEST_SIZE, "sha3-256"); +} + +static int verify_io_u_sha3_384(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_384 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_384_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_384_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_384_DIGEST_SIZE, "sha3-384"); +} + +static int verify_io_u_sha3_512(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_512 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_512_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx 
= { + .sha = sha, + }; + + fio_sha3_512_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_512_DIGEST_SIZE, "sha3-512"); +} + static int verify_io_u_sha512(struct verify_header *hdr, struct vcont *vc) { void *p = io_u_verify_off(hdr, vc); @@ -759,7 +852,7 @@ * state of numberio, that would have been written to each block * in a previous run of fio, has been reached. */ - if ((td_write(td) || td_rw(td)) && (td_min_bs(td) == td_max_bs(td)) && + if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) && !td->o.time_based) if (!td->o.verify_only || td->o.loops == 0) if (hdr->numberio != io_u->numberio) { @@ -881,6 +974,18 @@ case VERIFY_SHA512: ret = verify_io_u_sha512(hdr, &vc); break; + case VERIFY_SHA3_224: + ret = verify_io_u_sha3_224(hdr, &vc); + break; + case VERIFY_SHA3_256: + ret = verify_io_u_sha3_256(hdr, &vc); + break; + case VERIFY_SHA3_384: + ret = verify_io_u_sha3_384(hdr, &vc); + break; + case VERIFY_SHA3_512: + ret = verify_io_u_sha3_512(hdr, &vc); + break; case VERIFY_XXHASH: ret = verify_io_u_xxhash(hdr, &vc); break; @@ -918,6 +1023,56 @@ vh->hash = XXH32_digest(state); } +static void fill_sha3(struct fio_sha3_ctx *sha3_ctx, void *p, unsigned int len) +{ + fio_sha3_update(sha3_ctx, p, len); + fio_sha3_final(sha3_ctx); +} + +static void fill_sha3_224(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_224 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_224_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha3_256(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_256 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_256_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha3_384(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_384 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + 
fio_sha3_384_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha3_512(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_512 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_512_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + static void fill_sha512(struct verify_header *hdr, void *p, unsigned int len) { struct vhdr_sha512 *vh = hdr_priv(hdr); @@ -1012,7 +1167,7 @@ hdr->rand_seed = rand_seed; hdr->offset = io_u->offset + header_num * td->o.verify_interval; hdr->time_sec = io_u->start_time.tv_sec; - hdr->time_usec = io_u->start_time.tv_usec; + hdr->time_usec = io_u->start_time.tv_nsec / 1000; hdr->thread = td->thread_number; hdr->numberio = io_u->numberio; hdr->crc32 = fio_crc32c(p, offsetof(struct verify_header, crc32)); @@ -1033,9 +1188,10 @@ unsigned int header_len) { unsigned int data_len; - void *data, *p; + void *data; + char *p; - p = (void *) hdr; + p = (char *) hdr; fill_hdr(td, io_u, hdr, header_num, header_len, io_u->rand_seed); @@ -1084,6 +1240,26 @@ io_u, hdr->len); fill_sha512(hdr, data, data_len); break; + case VERIFY_SHA3_224: + dprint(FD_VERIFY, "fill sha3-224 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_224(hdr, data, data_len); + break; + case VERIFY_SHA3_256: + dprint(FD_VERIFY, "fill sha3-256 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_256(hdr, data, data_len); + break; + case VERIFY_SHA3_384: + dprint(FD_VERIFY, "fill sha3-384 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_384(hdr, data, data_len); + break; + case VERIFY_SHA3_512: + dprint(FD_VERIFY, "fill sha3-512 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_512(hdr, data, data_len); + break; case VERIFY_XXHASH: dprint(FD_VERIFY, "fill xxhash io_u %p, len %u\n", io_u, hdr->len); @@ -1211,6 +1387,7 @@ { if (td->o.verify == VERIFY_CRC32C_INTEL || td->o.verify == VERIFY_CRC32C) { + crc32c_arm64_probe(); crc32c_intel_probe(); } } diff -Nru fio-2.16/verify.h fio-3.1/verify.h --- 
fio-2.16/verify.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/verify.h 2017-09-28 10:23:20.000000000 +0000 @@ -20,6 +20,10 @@ VERIFY_CRC7, /* crc7 sum data blocks */ VERIFY_SHA256, /* sha256 sum data blocks */ VERIFY_SHA512, /* sha512 sum data blocks */ + VERIFY_SHA3_224, /* sha3-224 sum data blocks */ + VERIFY_SHA3_256, /* sha3-256 sum data blocks */ + VERIFY_SHA3_384, /* sha3-384 sum data blocks */ + VERIFY_SHA3_512, /* sha3-512 sum data blocks */ VERIFY_XXHASH, /* xxhash sum data blocks */ VERIFY_SHA1, /* sha1 sum data blocks */ VERIFY_PATTERN, /* verify specific patterns */ @@ -48,6 +52,18 @@ struct vhdr_md5 { uint32_t md5_digest[4]; }; +struct vhdr_sha3_224 { + uint8_t sha[224 / 8]; +}; +struct vhdr_sha3_256 { + uint8_t sha[256 / 8]; +}; +struct vhdr_sha3_384 { + uint8_t sha[384 / 8]; +}; +struct vhdr_sha3_512 { + uint8_t sha[512 / 8]; +}; struct vhdr_sha512 { uint8_t sha512[128]; }; diff -Nru fio-2.16/verify-state.h fio-3.1/verify-state.h --- fio-2.16/verify-state.h 2016-12-20 06:12:56.000000000 +0000 +++ fio-3.1/verify-state.h 2017-09-28 10:23:20.000000000 +0000 @@ -77,7 +77,7 @@ static inline struct thread_io_list *io_list_next(struct thread_io_list *s) { - return (void *) s + thread_io_list_sz(s); + return (struct thread_io_list *)((char *) s + thread_io_list_sz(s)); } static inline void verify_state_gen_name(char *out, size_t size,