diff -Nru fio-2.16/appveyor.yml fio-3.1/appveyor.yml
--- fio-2.16/appveyor.yml	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/appveyor.yml	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,27 @@
+clone_depth: 50
+environment:
+  MAKEFLAGS: -j 2
+  matrix:
+    - platform: x86_64  # 64-bit job, tools taken from C:\cygwin64
+      BUILD_ARCH: x64
+      CYG_ROOT: C:\cygwin64
+      CONFIGURE_OPTIONS:
+    - platform: x86  # 32-bit job, tools taken from C:\cygwin
+      BUILD_ARCH: x86
+      CYG_ROOT: C:\cygwin
+      CONFIGURE_OPTIONS: --build-32bit-win
+
+build_script:  # configure with -Werror, then build, inside a Cygwin login shell
+  - SET PATH=%CYG_ROOT%\bin;%PATH%
+  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && ./configure --extra-cflags=\"-Werror\" ${CONFIGURE_OPTIONS} && make.exe"'
+
+after_build:  # runs os\windows\dobuild.cmd for the job's BUILD_ARCH
+  - cd os\windows && dobuild.cmd %BUILD_ARCH%
+
+test_script:  # report the fio.exe file type, then run the make test target
+  - SET PATH=%CYG_ROOT%\bin;%PATH%
+  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && file.exe fio.exe && make.exe test"'
+
+artifacts:
+  - path: os\windows\*.msi
+    name: msi
diff -Nru fio-2.16/arch/arch-aarch64.h fio-3.1/arch/arch-aarch64.h
--- fio-2.16/arch/arch-aarch64.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/arch/arch-aarch64.h	2017-09-28 10:23:20.000000000 +0000
@@ -27,4 +27,8 @@
 
 #define ARCH_HAVE_FFZ
 
+#ifdef ARCH_HAVE_CRC_CRYPTO
+#define ARCH_HAVE_ARM64_CRC_CRYPTO
+#endif
+
 #endif
diff -Nru fio-2.16/arch/arch-arm.h fio-3.1/arch/arch-arm.h
--- fio-2.16/arch/arch-arm.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/arch/arch-arm.h	2017-09-28 10:23:20.000000000 +0000
@@ -14,6 +14,8 @@
 #define	nop		__asm__ __volatile__ ("nop")
 #define read_barrier()	__sync_synchronize()
 #define write_barrier()	__sync_synchronize()
+#else
+#error "unsupported ARM architecture"
 #endif
 
 #endif
diff -Nru fio-2.16/arch/arch.h fio-3.1/arch/arch.h
--- fio-2.16/arch/arch.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/arch/arch.h	2017-09-28 10:23:20.000000000 +0000
@@ -1,6 +1,8 @@
 #ifndef ARCH_H
 #define ARCH_H
 
+#include "../lib/types.h"
+
 enum {
 	arch_x86_64 = 1,
 	arch_x86,
diff -Nru fio-2.16/arch/arch-ia64.h fio-3.1/arch/arch-ia64.h
--- fio-2.16/arch/arch-ia64.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/arch/arch-ia64.h	2017-09-28 10:23:20.000000000 +0000
@@ -28,10 +28,10 @@
 }
 
 #define ARCH_HAVE_INIT
-extern int tsc_reliable;
+extern bool tsc_reliable;
 static inline int arch_init(char *envp[])
 {
-	tsc_reliable = 1;
+	tsc_reliable = true;
 	return 0;
 }
 
diff -Nru fio-2.16/arch/arch-ppc.h fio-3.1/arch/arch-ppc.h
--- fio-2.16/arch/arch-ppc.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/arch/arch-ppc.h	2017-09-28 10:23:20.000000000 +0000
@@ -62,7 +62,8 @@
 		"	cmpwi %0,0;\n"
 		"	beq-  90b;\n"
 	: "=r" (rval)
-	: "i" (SPRN_TBRL));
+	: "i" (SPRN_TBRL)
+	: "cr0");
 
 	return rval;
 }
@@ -117,12 +118,12 @@
 #endif
 
 #define ARCH_HAVE_INIT
-extern int tsc_reliable;
+extern bool tsc_reliable;
 
 static inline int arch_init(char *envp[])
 {
 #if 0
-	tsc_reliable = 1;
+	tsc_reliable = true;
 	atb_clocktest();
 #endif
 	return 0;
diff -Nru fio-2.16/arch/arch-s390.h fio-3.1/arch/arch-s390.h
--- fio-2.16/arch/arch-s390.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/arch/arch-s390.h	2017-09-28 10:23:20.000000000 +0000
@@ -28,10 +28,10 @@
 #undef ARCH_CPU_CLOCK_WRAPS
 
 #define ARCH_HAVE_INIT
-extern int tsc_reliable;
+extern bool tsc_reliable;
 static inline int arch_init(char *envp[])
 {
-	tsc_reliable = 1;
+	tsc_reliable = true;
 	return 0;
 }
 
diff -Nru fio-2.16/arch/arch-x86-common.h fio-3.1/arch/arch-x86-common.h
--- fio-2.16/arch/arch-x86-common.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/arch/arch-x86-common.h	2017-09-28 10:23:20.000000000 +0000
@@ -14,7 +14,7 @@
 
 #define ARCH_HAVE_INIT
 
-extern int tsc_reliable;
+extern bool tsc_reliable;
 extern int arch_random;
 
 static inline void arch_init_intel(unsigned int level)
diff -Nru fio-2.16/backend.c fio-3.1/backend.c
--- fio-2.16/backend.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/backend.c	2017-09-28 10:23:20.000000000 +0000
@@ -76,9 +76,6 @@
 int temp_stall_ts;
 unsigned long done_secs = 0;
 
-#define PAGE_ALIGN(buf)	\
-	(char *) (((uintptr_t) (buf) + page_mask) & ~page_mask)
-
 #define JOB_START_TIMEOUT	(5 * 1000)
 
 static void sig_int(int sig)
@@ -139,7 +136,7 @@
 /*
  * Check if we are above the minimum rate given.
  */
-static bool __check_min_rate(struct thread_data *td, struct timeval *now,
+static bool __check_min_rate(struct thread_data *td, struct timespec *now,
 			     enum fio_ddir ddir)
 {
 	unsigned long long bytes = 0;
@@ -180,8 +177,8 @@
 			 * check bandwidth specified rate
 			 */
 			if (bytes < td->rate_bytes[ddir]) {
-				log_err("%s: min rate %u not met\n", td->o.name,
-								ratemin);
+				log_err("%s: rate_min=%uB/s not met, only transferred %lluB\n",
+					td->o.name, ratemin, bytes);
 				return true;
 			} else {
 				if (spent)
@@ -191,9 +188,8 @@
 
 				if (rate < ratemin ||
 				    bytes < td->rate_bytes[ddir]) {
-					log_err("%s: min rate %u not met, got"
-						" %luKB/sec\n", td->o.name,
-							ratemin, rate);
+					log_err("%s: rate_min=%uB/s not met, got %luB/s\n",
+						td->o.name, ratemin, rate);
 					return true;
 				}
 			}
@@ -202,8 +198,8 @@
 			 * checks iops specified rate
 			 */
 			if (iops < rate_iops) {
-				log_err("%s: min iops rate %u not met\n",
-						td->o.name, rate_iops);
+				log_err("%s: rate_iops_min=%u not met, only performed %lu IOs\n",
+						td->o.name, rate_iops, iops);
 				return true;
 			} else {
 				if (spent)
@@ -213,9 +209,8 @@
 
 				if (rate < rate_iops_min ||
 				    iops < td->rate_blocks[ddir]) {
-					log_err("%s: min iops rate %u not met,"
-						" got %lu\n", td->o.name,
-							rate_iops_min, rate);
+					log_err("%s: rate_iops_min=%u not met, got %lu IOPS\n",
+						td->o.name, rate_iops_min, rate);
 					return true;
 				}
 			}
@@ -228,7 +223,7 @@
 	return false;
 }
 
-static bool check_min_rate(struct thread_data *td, struct timeval *now)
+static bool check_min_rate(struct thread_data *td, struct timespec *now)
 {
 	bool ret = false;
 
@@ -340,18 +335,18 @@
 	return ret;
 }
 
-static inline void __update_tv_cache(struct thread_data *td)
+static inline void __update_ts_cache(struct thread_data *td)
 {
-	fio_gettime(&td->tv_cache, NULL);
+	fio_gettime(&td->ts_cache, NULL);
 }
 
-static inline void update_tv_cache(struct thread_data *td)
+static inline void update_ts_cache(struct thread_data *td)
 {
-	if ((++td->tv_cache_nr & td->tv_cache_mask) == td->tv_cache_mask)
-		__update_tv_cache(td);
+	if ((++td->ts_cache_nr & td->ts_cache_mask) == td->ts_cache_mask)
+		__update_ts_cache(td);
 }
 
-static inline bool runtime_exceeded(struct thread_data *td, struct timeval *t)
+static inline bool runtime_exceeded(struct thread_data *td, struct timespec *t)
 {
 	if (in_ramp_time(td))
 		return false;
@@ -435,7 +430,7 @@
 	}
 }
 
-static int wait_for_completions(struct thread_data *td, struct timeval *time)
+static int wait_for_completions(struct thread_data *td, struct timespec *time)
 {
 	const int full = queue_full(td);
 	int min_evts = 0;
@@ -467,7 +462,7 @@
 
 int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret,
 		   enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify,
-		   struct timeval *comp_time)
+		   struct timespec *comp_time)
 {
 	int ret2;
 
@@ -592,6 +587,50 @@
 }
 
 /*
+ * Check if io_u will overlap an in-flight IO in the queue
+ */
+static bool in_flight_overlap(struct io_u_queue *q, struct io_u *io_u)
+{
+	/* byte range [lo, hi) that the candidate io_u would touch */
+	const unsigned long long lo = io_u->offset;
+	const unsigned long long hi = io_u->offset + io_u->buflen;
+	struct io_u *other;
+	int idx;
+
+	io_u_qiter(q, other, idx) {
+		unsigned long long other_lo, other_hi;
+
+		/* entries not flagged as in flight are not compared */
+		if (!(other->flags & IO_U_F_FLIGHT))
+			continue;
+
+		other_lo = other->offset;
+		other_hi = other->offset + other->buflen;
+		if (lo >= other_hi || other_lo >= hi)
+			continue;
+		dprint(FD_IO, "in-flight overlap: %llu/%lu, %llu/%lu\n",
+				lo, io_u->buflen,
+				other_lo, other->buflen);
+		return true;
+	}
+
+	return false;
+}
+
+static int io_u_submit(struct thread_data *td, struct io_u *io_u)
+{
+	/*
+	 * The overlap scan only runs when serialize_overlap is set and
+	 * cur_depth says another IO besides this one is outstanding.
+	 */
+	const bool scan = td->o.serialize_overlap && td->cur_depth > 1;
+
+	if (scan && in_flight_overlap(&td->io_u_all, io_u))
+		return FIO_Q_BUSY;
+	return td_io_queue(td, io_u);
+}
+
+/*
  * The main verify engine. Runs over the writes we previously submitted,
  * reads the blocks back in, and checks the crc/md5 of the data.
  */
@@ -638,12 +677,12 @@
 		enum fio_ddir ddir;
 		int full;
 
-		update_tv_cache(td);
+		update_ts_cache(td);
 		check_update_rusage(td);
 
-		if (runtime_exceeded(td, &td->tv_cache)) {
-			__update_tv_cache(td);
-			if (runtime_exceeded(td, &td->tv_cache)) {
+		if (runtime_exceeded(td, &td->ts_cache)) {
+			__update_ts_cache(td);
+			if (runtime_exceeded(td, &td->ts_cache)) {
 				fio_mark_td_terminate(td);
 				break;
 			}
@@ -721,7 +760,7 @@
 		if (!td->o.disable_slat)
 			fio_gettime(&io_u->start_time, NULL);
 
-		ret = td_io_queue(td, io_u);
+		ret = io_u_submit(td, io_u);
 
 		if (io_queue_event(td, io_u, &ret, ddir, NULL, 1, NULL))
 			break;
@@ -781,8 +820,8 @@
 	else
 		bytes = this_bytes[DDIR_TRIM];
 
-	if (td->o.io_limit)
-		limit = td->o.io_limit;
+	if (td->o.io_size)
+		limit = td->o.io_size;
 	else
 		limit = td->o.size;
 
@@ -816,13 +855,14 @@
 		uint64_t val;
 		iops = bps / td->o.bs[ddir];
 		val = (int64_t) (1000000 / iops) *
-				-logf(__rand_0_1(&td->poisson_state));
+				-logf(__rand_0_1(&td->poisson_state[ddir]));
 		if (val) {
-			dprint(FD_RATE, "poisson rate iops=%llu\n",
-					(unsigned long long) 1000000 / val);
+			dprint(FD_RATE, "poisson rate iops=%llu, ddir=%d\n",
+					(unsigned long long) 1000000 / val,
+					ddir);
 		}
-		td->last_usec += val;
-		return td->last_usec;
+		td->last_usec[ddir] += val;
+		return td->last_usec[ddir];
 	} else if (bps) {
 		secs = bytes / bps;
 		remainder = bytes % bps;
@@ -856,11 +896,11 @@
 
 	total_bytes = td->o.size;
 	/*
-	* Allow random overwrite workloads to write up to io_limit
+	* Allow random overwrite workloads to write up to io_size
 	* before starting verification phase as 'size' doesn't apply.
 	*/
 	if (td_write(td) && td_random(td) && td->o.norandommap)
-		total_bytes = max(total_bytes, (uint64_t) td->o.io_limit);
+		total_bytes = max(total_bytes, (uint64_t) td->o.io_size);
 	/*
 	 * If verify_backlog is enabled, we'll run the verify in this
 	 * handler as well. For that case, we may need up to twice the
@@ -878,7 +918,7 @@
 	while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
 		(!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) ||
 		td->o.time_based) {
-		struct timeval comp_time;
+		struct timespec comp_time;
 		struct io_u *io_u;
 		int full;
 		enum fio_ddir ddir;
@@ -888,11 +928,11 @@
 		if (td->terminate || td->done)
 			break;
 
-		update_tv_cache(td);
+		update_ts_cache(td);
 
-		if (runtime_exceeded(td, &td->tv_cache)) {
-			__update_tv_cache(td);
-			if (runtime_exceeded(td, &td->tv_cache)) {
+		if (runtime_exceeded(td, &td->ts_cache)) {
+			__update_ts_cache(td);
+			if (runtime_exceeded(td, &td->ts_cache)) {
 				fio_mark_td_terminate(td);
 				break;
 			}
@@ -987,7 +1027,7 @@
 				td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
 
 		} else {
-			ret = td_io_queue(td, io_u);
+			ret = io_u_submit(td, io_u);
 
 			if (should_check_rate(td))
 				td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
@@ -1200,7 +1240,7 @@
 
 	if (td->o.odirect || td->o.mem_align || td->o.oatomic ||
 	    td_ioengine_flagged(td, FIO_RAWIO))
-		p = PAGE_ALIGN(td->orig_buffer) + td->o.mem_align;
+		p = PTR_ALIGN(td->orig_buffer, page_mask) + td->o.mem_align;
 	else
 		p = td->orig_buffer;
 
@@ -1266,6 +1306,10 @@
 	return 0;
 }
 
+/*
+ * This function is Linux specific.
+ * FIO_HAVE_IOSCHED_SWITCH enabled currently means it's Linux.
+ */
 static int switch_ioscheduler(struct thread_data *td)
 {
 #ifdef FIO_HAVE_IOSCHED_SWITCH
@@ -1276,7 +1320,8 @@
 	if (td_ioengine_flagged(td, FIO_DISKLESSIO))
 		return 0;
 
-	sprintf(tmp, "%s/queue/scheduler", td->sysfs_root);
+	assert(td->files && td->files[0]);
+	sprintf(tmp, "%s/queue/scheduler", td->files[0]->du->sysfs_root);
 
 	f = fopen(tmp, "r+");
 	if (!f) {
@@ -1346,6 +1391,8 @@
 
 	if (td->done)
 		return false;
+	if (td->terminate)
+		return false;
 	if (td->o.time_based)
 		return true;
 	if (td->o.loops) {
@@ -1355,8 +1402,8 @@
 	if (exceeds_number_ios(td))
 		return false;
 
-	if (td->o.io_limit)
-		limit = td->o.io_limit;
+	if (td->o.io_size)
+		limit = td->o.io_size;
 	else
 		limit = td->o.size;
 
@@ -1364,14 +1411,14 @@
 		uint64_t diff;
 
 		/*
-		 * If the difference is less than the minimum IO size, we
+		 * If the difference is less than the maximum IO size, we
 		 * are done.
 		 */
 		diff = limit - ddir_rw_sum(td->io_bytes);
 		if (diff < td_max_bs(td))
 			return false;
 
-		if (fio_files_done(td) && !td->o.io_limit)
+		if (fio_files_done(td) && !td->o.io_size)
 			return false;
 
 		return true;
@@ -1456,6 +1503,7 @@
 	struct thread_data *td = fd->td;
 	struct thread_options *o = &td->o;
 	struct sk_out *sk_out = fd->sk_out;
+	uint64_t bytes_done[DDIR_RWDIR_CNT];
 	int deadlock_loop_cnt;
 	int clear_state;
 	int ret;
@@ -1677,12 +1725,14 @@
 					sizeof(td->bw_sample_time));
 	}
 
+	memset(bytes_done, 0, sizeof(bytes_done));
 	clear_state = 0;
+
 	while (keep_running(td)) {
 		uint64_t verify_bytes;
 
 		fio_gettime(&td->start, NULL);
-		memcpy(&td->tv_cache, &td->start, sizeof(td->start));
+		memcpy(&td->ts_cache, &td->start, sizeof(td->start));
 
 		if (clear_state) {
 			clear_io_state(td, 0);
@@ -1693,11 +1743,9 @@
 
 		prune_io_piece_log(td);
 
-		if (td->o.verify_only && (td_write(td) || td_rw(td)))
+		if (td->o.verify_only && td_write(td))
 			verify_bytes = do_dry_run(td);
 		else {
-			uint64_t bytes_done[DDIR_RWDIR_CNT];
-
 			do_io(td, bytes_done);
 
 			if (!ddir_rw_sum(bytes_done)) {
@@ -1776,6 +1824,18 @@
 			break;
 	}
 
+	/*
+	 * If td ended up with no I/O when it should have done some,
+	 * then something went wrong, unless the ioengine is flagged
+	 * FIO_NOIO or FIO_DISKLESSIO. (Should other flags be exempt too?)
+	 */
+	if ((td->o.size || td->o.io_size) && !ddir_rw_sum(bytes_done) &&
+	    !(td_ioengine_flagged(td, FIO_NOIO) ||
+	      td_ioengine_flagged(td, FIO_DISKLESSIO)))
+		log_err("%s: No I/O performed by %s, "
+			 "perhaps try --debug=io option for details?\n",
+			 td->o.name, td->io_ops->name);
+
 	td_set_runstate(td, TD_FINISHING);
 
 	update_rusage_stat(td);
@@ -1836,9 +1896,6 @@
 	if (o->write_iolog_file)
 		write_iolog_close(td);
 
-	fio_mutex_remove(td->mutex);
-	td->mutex = NULL;
-
 	td_set_runstate(td, TD_EXITED);
 
 	/*
@@ -1851,14 +1908,6 @@
 	return (void *) (uintptr_t) td->error;
 }
 
-static void dump_td_info(struct thread_data *td)
-{
-	log_err("fio: job '%s' (state=%d) hasn't exited in %lu seconds, it "
-		"appears to be stuck. Doing forceful exit of this job.\n",
-			td->o.name, td->runstate,
-			(unsigned long) time_since_now(&td->terminate_time));
-}
-
 /*
  * Run over the job map and reap the threads that have exited, if any.
  */
@@ -1943,7 +1992,11 @@
 		if (td->terminate &&
 		    td->runstate < TD_FSYNCING &&
 		    time_since_now(&td->terminate_time) >= FIO_REAP_TIMEOUT) {
-			dump_td_info(td);
+			log_err("fio: job '%s' (state=%d) hasn't exited in "
+				"%lu seconds, it appears to be stuck. Doing "
+				"forceful exit of this job.\n",
+				td->o.name, td->runstate,
+				(unsigned long) time_since_now(&td->terminate_time));
 			td_set_runstate(td, TD_REAPED);
 			goto reaped;
 		}
@@ -1991,7 +2044,10 @@
 static bool trigger_timedout(void)
 {
 	if (trigger_timeout)
-		return time_since_genesis() >= trigger_timeout;
+		if (time_since_genesis() >= trigger_timeout) {
+			trigger_timeout = 0;
+			return true;
+		}
 
 	return false;
 }
@@ -2000,7 +2056,7 @@
 {
 	int ret;
 
-	if (!cmd)
+	if (!cmd || cmd[0] == '\0')
 		return;
 
 	ret = system(cmd);
@@ -2056,8 +2112,16 @@
 	if (!td_write(td) || td->o.allow_mounted_write)
 		return false;
 
+	/*
+	 * If FIO_HAVE_CHARDEV_SIZE is defined, it's likely that chrdevs
+	 * are mkfs'd and mounted.
+	 */
 	for_each_file(td, f, i) {
-		if (f->filetype != FIO_TYPE_BD)
+#ifdef FIO_HAVE_CHARDEV_SIZE
+		if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR)
+#else
+		if (f->filetype != FIO_TYPE_BLOCK)
+#endif
 			continue;
 		if (device_is_mounted(f->file_name))
 			goto mounted;
@@ -2065,7 +2129,7 @@
 
 	return false;
 mounted:
-	log_err("fio: %s appears mounted, and 'allow_mounted_write' isn't set. Aborting.", f->file_name);
+	log_err("fio: %s appears mounted, and 'allow_mounted_write' isn't set. Aborting.\n", f->file_name);
 	return true;
 }
 
@@ -2187,7 +2251,7 @@
 
 	while (todo) {
 		struct thread_data *map[REAL_MAX_JOBS];
-		struct timeval this_start;
+		struct timespec this_start;
 		int this_jobs = 0, left;
 		struct fork_data *fd;
 
@@ -2427,6 +2491,8 @@
 			fio_mutex_remove(td->rusage_sem);
 			td->rusage_sem = NULL;
 		}
+		fio_mutex_remove(td->mutex);
+		td->mutex = NULL;
 	}
 
 	free_disk_util();
diff -Nru fio-2.16/blktrace_api.h fio-3.1/blktrace_api.h
--- fio-2.16/blktrace_api.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/blktrace_api.h	2017-09-28 10:23:20.000000000 +0000
@@ -127,9 +127,4 @@
 	__u32 pid;
 };
 
-#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
-#define BLKTRACESTART _IO(0x12,116)
-#define BLKTRACESTOP _IO(0x12,117)
-#define BLKTRACETEARDOWN _IO(0x12,118)
-
 #endif
diff -Nru fio-2.16/blktrace.c fio-3.1/blktrace.c
--- fio-2.16/blktrace.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/blktrace.c	2017-09-28 10:23:20.000000000 +0000
@@ -10,6 +10,7 @@
 
 #include "flist.h"
 #include "fio.h"
+#include "blktrace.h"
 #include "blktrace_api.h"
 #include "oslib/linux-dev-lookup.h"
 
diff -Nru fio-2.16/blktrace.h fio-3.1/blktrace.h
--- fio-2.16/blktrace.h	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/blktrace.h	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,23 @@
+#ifndef FIO_BLKTRACE_H
+#define FIO_BLKTRACE_H
+
+#ifdef FIO_HAVE_BLKTRACE
+
+int is_blktrace(const char *, int *);
+int load_blktrace(struct thread_data *, const char *, int);
+
+#else	/* !FIO_HAVE_BLKTRACE: inline stubs replace the real functions */
+
+static inline int is_blktrace(const char *fname, int *need_swap)
+{
+	return 0;	/* stub: always 0, *need_swap is not written */
+}
+
+static inline int load_blktrace(struct thread_data *td, const char *fname,
+				int need_swap)
+{
+	return 1;	/* stub: always 1, nothing is loaded */
+}
+
+#endif	/* FIO_HAVE_BLKTRACE */
+#endif	/* FIO_BLKTRACE_H */
diff -Nru fio-2.16/cconv.c fio-3.1/cconv.c
--- fio-2.16/cconv.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/cconv.c	2017-09-28 10:23:20.000000000 +0000
@@ -88,7 +88,7 @@
 	o->td_ddir = le32_to_cpu(top->td_ddir);
 	o->rw_seq = le32_to_cpu(top->rw_seq);
 	o->kb_base = le32_to_cpu(top->kb_base);
-	o->unit_base = le32_to_cpu(top->kb_base);
+	o->unit_base = le32_to_cpu(top->unit_base);
 	o->ddir_seq_nr = le32_to_cpu(top->ddir_seq_nr);
 	o->ddir_seq_add = le64_to_cpu(top->ddir_seq_add);
 	o->iodepth = le32_to_cpu(top->iodepth);
@@ -96,14 +96,16 @@
 	o->iodepth_batch = le32_to_cpu(top->iodepth_batch);
 	o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min);
 	o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max);
+	o->serialize_overlap = le32_to_cpu(top->serialize_overlap);
 	o->size = le64_to_cpu(top->size);
-	o->io_limit = le64_to_cpu(top->io_limit);
+	o->io_size = le64_to_cpu(top->io_size);
 	o->size_percent = le32_to_cpu(top->size_percent);
 	o->fill_device = le32_to_cpu(top->fill_device);
 	o->file_append = le32_to_cpu(top->file_append);
 	o->file_size_low = le64_to_cpu(top->file_size_low);
 	o->file_size_high = le64_to_cpu(top->file_size_high);
 	o->start_offset = le64_to_cpu(top->start_offset);
+	o->start_offset_percent = le32_to_cpu(top->start_offset_percent);
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		o->bs[i] = le32_to_cpu(top->bs[i]);
@@ -155,6 +157,7 @@
 	o->end_fsync = le32_to_cpu(top->end_fsync);
 	o->pre_read = le32_to_cpu(top->pre_read);
 	o->sync_io = le32_to_cpu(top->sync_io);
+	o->write_hint = le32_to_cpu(top->write_hint);
 	o->verify = le32_to_cpu(top->verify);
 	o->do_verify = le32_to_cpu(top->do_verify);
 	o->verifysort = le32_to_cpu(top->verifysort);
@@ -235,6 +238,7 @@
 	o->new_group = le32_to_cpu(top->new_group);
 	o->numjobs = le32_to_cpu(top->numjobs);
 	o->cpus_allowed_policy = le32_to_cpu(top->cpus_allowed_policy);
+	o->gpu_dev_id = le32_to_cpu(top->gpu_dev_id);
 	o->iolog = le32_to_cpu(top->iolog);
 	o->rwmixcycle = le32_to_cpu(top->rwmixcycle);
 	o->nice = le32_to_cpu(top->nice);
@@ -242,6 +246,7 @@
 	o->ioprio_class = le32_to_cpu(top->ioprio_class);
 	o->file_service_type = le32_to_cpu(top->file_service_type);
 	o->group_reporting = le32_to_cpu(top->group_reporting);
+	o->stats = le32_to_cpu(top->stats);
 	o->fadvise_hint = le32_to_cpu(top->fadvise_hint);
 	o->fallocate_mode = le32_to_cpu(top->fallocate_mode);
 	o->zero_buffers = le32_to_cpu(top->zero_buffers);
@@ -262,6 +267,7 @@
 	o->trim_batch = le32_to_cpu(top->trim_batch);
 	o->trim_zero = le32_to_cpu(top->trim_zero);
 	o->clat_percentiles = le32_to_cpu(top->clat_percentiles);
+	o->lat_percentiles = le32_to_cpu(top->lat_percentiles);
 	o->percentile_precision = le32_to_cpu(top->percentile_precision);
 	o->continue_on_error = le32_to_cpu(top->continue_on_error);
 	o->cgroup_weight = le32_to_cpu(top->cgroup_weight);
@@ -279,7 +285,6 @@
 	o->compress_percentage = le32_to_cpu(top->compress_percentage);
 	o->compress_chunk = le32_to_cpu(top->compress_chunk);
 	o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
-	o->skip_bad = le32_to_cpu(top->skip_bad);
 	o->block_error_hist = le32_to_cpu(top->block_error_hist);
 	o->replay_align = le32_to_cpu(top->replay_align);
 	o->replay_scale = le32_to_cpu(top->replay_scale);
@@ -336,13 +341,14 @@
 	top->td_ddir = cpu_to_le32(o->td_ddir);
 	top->rw_seq = cpu_to_le32(o->rw_seq);
 	top->kb_base = cpu_to_le32(o->kb_base);
-	top->unit_base = cpu_to_le32(o->kb_base);
+	top->unit_base = cpu_to_le32(o->unit_base);
 	top->ddir_seq_nr = cpu_to_le32(o->ddir_seq_nr);
 	top->iodepth = cpu_to_le32(o->iodepth);
 	top->iodepth_low = cpu_to_le32(o->iodepth_low);
 	top->iodepth_batch = cpu_to_le32(o->iodepth_batch);
 	top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min);
 	top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max);
+	top->serialize_overlap = cpu_to_le32(o->serialize_overlap);
 	top->size_percent = cpu_to_le32(o->size_percent);
 	top->fill_device = cpu_to_le32(o->fill_device);
 	top->file_append = cpu_to_le32(o->file_append);
@@ -362,6 +368,7 @@
 	top->end_fsync = cpu_to_le32(o->end_fsync);
 	top->pre_read = cpu_to_le32(o->pre_read);
 	top->sync_io = cpu_to_le32(o->sync_io);
+	top->write_hint = cpu_to_le32(o->write_hint);
 	top->verify = cpu_to_le32(o->verify);
 	top->do_verify = cpu_to_le32(o->do_verify);
 	top->verifysort = cpu_to_le32(o->verifysort);
@@ -419,6 +426,7 @@
 	top->new_group = cpu_to_le32(o->new_group);
 	top->numjobs = cpu_to_le32(o->numjobs);
 	top->cpus_allowed_policy = cpu_to_le32(o->cpus_allowed_policy);
+	top->gpu_dev_id = cpu_to_le32(o->gpu_dev_id);
 	top->iolog = cpu_to_le32(o->iolog);
 	top->rwmixcycle = cpu_to_le32(o->rwmixcycle);
 	top->nice = cpu_to_le32(o->nice);
@@ -426,6 +434,7 @@
 	top->ioprio_class = cpu_to_le32(o->ioprio_class);
 	top->file_service_type = cpu_to_le32(o->file_service_type);
 	top->group_reporting = cpu_to_le32(o->group_reporting);
+	top->stats = cpu_to_le32(o->stats);
 	top->fadvise_hint = cpu_to_le32(o->fadvise_hint);
 	top->fallocate_mode = cpu_to_le32(o->fallocate_mode);
 	top->zero_buffers = cpu_to_le32(o->zero_buffers);
@@ -446,6 +455,7 @@
 	top->trim_batch = cpu_to_le32(o->trim_batch);
 	top->trim_zero = cpu_to_le32(o->trim_zero);
 	top->clat_percentiles = cpu_to_le32(o->clat_percentiles);
+	top->lat_percentiles = cpu_to_le32(o->lat_percentiles);
 	top->percentile_precision = cpu_to_le32(o->percentile_precision);
 	top->continue_on_error = cpu_to_le32(o->continue_on_error);
 	top->cgroup_weight = cpu_to_le32(o->cgroup_weight);
@@ -464,7 +474,6 @@
 	top->compress_chunk = cpu_to_le32(o->compress_chunk);
 	top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
 	top->block_error_hist = cpu_to_le32(o->block_error_hist);
-	top->skip_bad = cpu_to_le32(o->skip_bad);
 	top->replay_align = cpu_to_le32(o->replay_align);
 	top->replay_scale = cpu_to_le32(o->replay_scale);
 	top->per_job_logs = cpu_to_le32(o->per_job_logs);
@@ -521,7 +530,7 @@
 	memcpy(top->buffer_pattern, o->buffer_pattern, MAX_PATTERN_SIZE);
 
 	top->size = __cpu_to_le64(o->size);
-	top->io_limit = __cpu_to_le64(o->io_limit);
+	top->io_size = __cpu_to_le64(o->io_size);
 	top->verify_backlog = __cpu_to_le64(o->verify_backlog);
 	top->start_delay = __cpu_to_le64(o->start_delay);
 	top->start_delay_high = __cpu_to_le64(o->start_delay_high);
@@ -539,6 +548,7 @@
 	top->file_size_low = __cpu_to_le64(o->file_size_low);
 	top->file_size_high = __cpu_to_le64(o->file_size_high);
 	top->start_offset = __cpu_to_le64(o->start_offset);
+	top->start_offset_percent = __cpu_to_le32(o->start_offset_percent);
 	top->trim_backlog = __cpu_to_le64(o->trim_backlog);
 	top->offset_increment = __cpu_to_le64(o->offset_increment);
 	top->number_ios = __cpu_to_le64(o->number_ios);
diff -Nru fio-2.16/client.c fio-3.1/client.c
--- fio-2.16/client.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/client.c	2017-09-28 10:23:20.000000000 +0000
@@ -48,7 +48,7 @@
 	.client_type	= FIO_CLIENT_TYPE_CLI,
 };
 
-static struct timeval eta_tv;
+static struct timespec eta_ts;
 
 static FLIST_HEAD(client_list);
 static FLIST_HEAD(eta_list);
@@ -318,7 +318,7 @@
 	client->hostname = strdup(hostname);
 
 	if (type == Fio_client_socket)
-		client->is_sock = 1;
+		client->is_sock = true;
 	else {
 		int ipv6;
 
@@ -728,7 +728,7 @@
 	strcpy((char *) pdu->file, filename);
 	pdu->client_type = cpu_to_le16((uint16_t) client->type);
 
-	client->sent_job = 1;
+	client->sent_job = true;
 	ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_LOAD_FILE, pdu, p_size,NULL, NULL);
 	free(pdu);
 	return ret;
@@ -781,7 +781,7 @@
 	pdu->buf_len = __cpu_to_le32(sb.st_size);
 	pdu->client_type = cpu_to_le32(client->type);
 
-	client->sent_job = 1;
+	client->sent_job = true;
 	ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_JOB, pdu, p_size, NULL, NULL);
 	free(pdu);
 	close(fd);
@@ -799,7 +799,7 @@
 		ret = __fio_client_send_remote_ini(client, filename);
 
 	if (!ret)
-		client->sent_job = 1;
+		client->sent_job = true;
 
 	return ret;
 }
@@ -885,6 +885,7 @@
 		convert_io_stat(&dst->slat_stat[i], &src->slat_stat[i]);
 		convert_io_stat(&dst->lat_stat[i], &src->lat_stat[i]);
 		convert_io_stat(&dst->bw_stat[i], &src->bw_stat[i]);
+		convert_io_stat(&dst->iops_stat[i], &src->iops_stat[i]);
 	}
 
 	dst->usr_time		= le64_to_cpu(src->usr_time);
@@ -892,7 +893,8 @@
 	dst->ctx		= le64_to_cpu(src->ctx);
 	dst->minf		= le64_to_cpu(src->minf);
 	dst->majf		= le64_to_cpu(src->majf);
-	dst->clat_percentiles	= le64_to_cpu(src->clat_percentiles);
+	dst->clat_percentiles	= le32_to_cpu(src->clat_percentiles);
+	dst->lat_percentiles	= le32_to_cpu(src->lat_percentiles);
 	dst->percentile_precision = le64_to_cpu(src->percentile_precision);
 
 	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
@@ -908,10 +910,12 @@
 		dst->io_u_complete[i]	= le32_to_cpu(src->io_u_complete[i]);
 	}
 
-	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) {
+	for (i = 0; i < FIO_IO_U_LAT_N_NR; i++)
+		dst->io_u_lat_n[i]	= le32_to_cpu(src->io_u_lat_n[i]);
+	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
 		dst->io_u_lat_u[i]	= le32_to_cpu(src->io_u_lat_u[i]);
+	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
 		dst->io_u_lat_m[i]	= le32_to_cpu(src->io_u_lat_m[i]);
-	}
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++)
 		for (j = 0; j < FIO_IO_U_PLAT_NR; j++)
@@ -972,7 +976,7 @@
 		dst->min_run[i]		= le64_to_cpu(src->min_run[i]);
 		dst->max_bw[i]		= le64_to_cpu(src->max_bw[i]);
 		dst->min_bw[i]		= le64_to_cpu(src->min_bw[i]);
-		dst->io_kb[i]		= le64_to_cpu(src->io_kb[i]);
+		dst->iobytes[i]		= le64_to_cpu(src->iobytes[i]);
 		dst->agg[i]		= le64_to_cpu(src->agg[i]);
 	}
 
@@ -1001,7 +1005,7 @@
 		opt_list = &client->opt_lists[p->ts.thread_number - 1];
 
 	tsobj = show_thread_status(&p->ts, &p->rs, opt_list, NULL);
-	client->did_stat = 1;
+	client->did_stat = true;
 	if (tsobj) {
 		json_object_add_client_info(tsobj, client);
 		json_array_add_value_object(clients_array, tsobj);
@@ -1123,7 +1127,7 @@
 	struct cmd_du_pdu *du = (struct cmd_du_pdu *) cmd->payload;
 
 	if (!client->disk_stats_shown) {
-		client->disk_stats_shown = 1;
+		client->disk_stats_shown = true;
 		log_info("\nDisk stats (read/write):\n");
 	}
 
@@ -1322,7 +1326,7 @@
 	log_pathname = malloc(10 + strlen((char *)pdu->name) +
 			strlen(client->hostname));
 	if (!log_pathname) {
-		log_err("fio: memory allocation of unique pathname failed");
+		log_err("fio: memory allocation of unique pathname failed\n");
 		return -1;
 	}
 	/* generate a unique pathname for the log file using hostname */
@@ -1450,7 +1454,7 @@
 	z_stream stream;
 	uint32_t nr_samples;
 	size_t total;
-	void *p;
+	char *p;
 
 	stream.zalloc = Z_NULL;
 	stream.zfree = Z_NULL;
@@ -1476,10 +1480,10 @@
 
 	memcpy(ret, pdu, sizeof(*pdu));
 
-	p = (void *) ret + sizeof(*pdu);
+	p = (char *) ret + sizeof(*pdu);
 
 	stream.avail_in = cmd->pdu_len - sizeof(*pdu);
-	stream.next_in = (void *) pdu + sizeof(*pdu);
+	stream.next_in = (void *)((char *) pdu + sizeof(*pdu));
 	while (stream.avail_in) {
 		unsigned int this_chunk = 65536;
 		unsigned int this_len;
@@ -1489,7 +1493,7 @@
 			this_chunk = total;
 
 		stream.avail_out = this_chunk;
-		stream.next_out = p;
+		stream.next_out = (void *)p;
 		err = inflate(&stream, Z_NO_FLUSH);
 		/* may be Z_OK, or Z_STREAM_END */
 		if (err < 0) {
@@ -1564,7 +1568,7 @@
 
 		s = __get_sample(samples, ret->log_offset, i);
 		if (ret->log_type == IO_LOG_TYPE_HIST)
-			s = (struct io_sample *)((void *)s + sizeof(struct io_u_plat_entry) * i);
+			s = (struct io_sample *)((char *)s + sizeof(struct io_u_plat_entry) * i);
 
 		s->time		= le64_to_cpu(s->time);
 		s->data.val	= le64_to_cpu(s->data.val);
@@ -1578,7 +1582,7 @@
 		}
 
 		if (ret->log_type == IO_LOG_TYPE_HIST) {
-			s->data.plat_entry = (struct io_u_plat_entry *)(((void *)s) + sizeof(*s));
+			s->data.plat_entry = (struct io_u_plat_entry *)(((char *)s) + sizeof(*s));
 			s->data.plat_entry->list.next = NULL;
 			s->data.plat_entry->list.prev = NULL;
 		}
@@ -1869,7 +1873,7 @@
 }
 
 static int client_check_cmd_timeout(struct fio_client *client,
-				    struct timeval *now)
+				    struct timespec *now)
 {
 	struct fio_net_cmd_reply *reply;
 	struct flist_head *entry, *tmp;
@@ -1878,7 +1882,7 @@
 	flist_for_each_safe(entry, tmp, &client->cmd_list) {
 		reply = flist_entry(entry, struct fio_net_cmd_reply, list);
 
-		if (mtime_since(&reply->tv, now) < FIO_NET_CLIENT_TIMEOUT)
+		if (mtime_since(&reply->ts, now) < FIO_NET_CLIENT_TIMEOUT)
 			continue;
 
 		if (!handle_cmd_timeout(client, reply))
@@ -1896,10 +1900,10 @@
 {
 	struct fio_client *client;
 	struct flist_head *entry, *tmp;
-	struct timeval tv;
+	struct timespec ts;
 	int ret = 0;
 
-	fio_gettime(&tv, NULL);
+	fio_gettime(&ts, NULL);
 
 	flist_for_each_safe(entry, tmp, &client_list) {
 		client = flist_entry(entry, struct fio_client, list);
@@ -1907,7 +1911,7 @@
 		if (flist_empty(&client->cmd_list))
 			continue;
 
-		if (!client_check_cmd_timeout(client, &tv))
+		if (!client_check_cmd_timeout(client, &ts))
 			continue;
 
 		if (client->ops->timed_out)
@@ -1928,7 +1932,7 @@
 	struct pollfd *pfds;
 	int i, ret = 0, retval = 0;
 
-	fio_gettime(&eta_tv, NULL);
+	fio_gettime(&eta_ts, NULL);
 
 	pfds = malloc(nr_clients * sizeof(struct pollfd));
 
@@ -1960,13 +1964,13 @@
 		assert(i == nr_clients);
 
 		do {
-			struct timeval tv;
+			struct timespec ts;
 			int timeout;
 
-			fio_gettime(&tv, NULL);
-			if (mtime_since(&eta_tv, &tv) >= 900) {
+			fio_gettime(&ts, NULL);
+			if (mtime_since(&eta_ts, &ts) >= 900) {
 				request_client_etas(ops);
-				memcpy(&eta_tv, &tv, sizeof(tv));
+				memcpy(&eta_ts, &ts, sizeof(ts));
 
 				if (fio_check_clients_timed_out())
 					break;
diff -Nru fio-2.16/client.h fio-3.1/client.h
--- fio-2.16/client.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/client.h	2017-09-28 10:23:20.000000000 +0000
@@ -6,6 +6,7 @@
 #include <netinet/in.h>
 #include <arpa/inet.h>
 
+#include "lib/types.h"
 #include "stat.h"
 
 struct fio_net_cmd;
@@ -45,16 +46,16 @@
 
 	int state;
 
-	int skip_newline;
-	int is_sock;
-	int disk_stats_shown;
+	bool skip_newline;
+	bool is_sock;
+	bool disk_stats_shown;
 	unsigned int jobs;
 	unsigned int nr_stat;
 	int error;
 	int signal;
 	int ipv6;
-	int sent_job;
-	int did_stat;
+	bool sent_job;
+	bool did_stat;
 	uint32_t type;
 
 	uint32_t thread_number;
diff -Nru fio-2.16/compiler/compiler.h fio-3.1/compiler/compiler.h
--- fio-2.16/compiler/compiler.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/compiler/compiler.h	2017-09-28 10:23:20.000000000 +0000
@@ -38,10 +38,12 @@
 #if defined(CONFIG_STATIC_ASSERT)
 #define compiletime_assert(condition, msg) _Static_assert(condition, msg)
 
-#else
+#elif !defined(CONFIG_DISABLE_OPTIMIZATIONS)
+
 #ifndef __compiletime_error
 #define __compiletime_error(message)
 #endif
+
 #ifndef __compiletime_error_fallback
 #define __compiletime_error_fallback(condition)	do { } while (0)
 #endif
@@ -61,6 +63,10 @@
 #define compiletime_assert(condition, msg) \
 	_compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 
+#else
+
+#define compiletime_assert(condition, msg)	do { } while (0)
+
 #endif
 
 #endif
diff -Nru fio-2.16/configure fio-3.1/configure
--- fio-2.16/configure	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/configure	2017-09-28 10:23:20.000000000 +0000
@@ -37,6 +37,11 @@
   exit 1
 }
 
+# Print result for each configuration test
+print_config() {
+  printf "%-30s%s\n" "$1" "$2"
+}
+
 # Default CFLAGS
 CFLAGS="-D_GNU_SOURCE -include config-host.h"
 BUILD_CFLAGS=""
@@ -138,6 +143,7 @@
 pmemblk="no"
 devdax="no"
 disable_lex=""
+disable_pmem="no"
 prefix=/usr/local
 
 # parse options
@@ -160,11 +166,12 @@
   ;;
   --build-static) build_static="yes"
   ;;
-  --enable-gfio)
-  gfio_check="yes"
+  --enable-gfio) gfio_check="yes"
   ;;
   --disable-numa) disable_numa="yes"
   ;;
+  --disable-rdma) disable_rdma="yes"
+  ;;
   --disable-rbd) disable_rbd="yes"
   ;;
   --disable-rbd-blkin) disable_rbd_blkin="yes"
@@ -173,10 +180,6 @@
   ;;
   --enable-libhdfs) libhdfs="yes"
   ;;
-  --enable-pmemblk) pmemblk="yes"
-  ;;
-  --enable-devdax) devdax="yes"
-  ;;
   --disable-lex) disable_lex="yes"
   ;;
   --enable-lex) disable_lex="no"
@@ -185,6 +188,10 @@
   ;;
   --disable-optimizations) disable_opt="yes"
   ;;
+  --disable-pmem) disable_pmem="yes"
+  ;;
+  --enable-cuda) enable_cuda="yes"
+  ;;
   --help)
     show_help="yes"
     ;;
@@ -196,23 +203,24 @@
 done
 
 if test "$show_help" = "yes" ; then
-  echo "--prefix=              Use this directory as installation prefix"
-  echo "--cpu=                 Specify target CPU if auto-detect fails"
-  echo "--cc=                  Specify compiler to use"
-  echo "--extra-cflags=        Specify extra CFLAGS to pass to compiler"
-  echo "--build-32bit-win      Enable 32-bit build on Windows"
-  echo "--build-static         Build a static fio"
-  echo "--esx                  Configure build options for esx"
-  echo "--enable-gfio          Enable building of gtk gfio"
-  echo "--disable-numa         Disable libnuma even if found"
-  echo "--disable-gfapi        Disable gfapi"
-  echo "--enable-libhdfs       Enable hdfs support"
-  echo "--enable-pmemblk       Enable NVML libpmemblk support"
-  echo "--enable-devdax        Enable NVM Device Dax support"
-  echo "--disable-lex          Disable use of lex/yacc for math"
-  echo "--enable-lex           Enable use of lex/yacc for math"
-  echo "--disable-shm          Disable SHM support"
+  echo "--prefix=               Use this directory as installation prefix"
+  echo "--cpu=                  Specify target CPU if auto-detect fails"
+  echo "--cc=                   Specify compiler to use"
+  echo "--extra-cflags=         Specify extra CFLAGS to pass to compiler"
+  echo "--build-32bit-win       Enable 32-bit build on Windows"
+  echo "--build-static          Build a static fio"
+  echo "--esx                   Configure build options for esx"
+  echo "--enable-gfio           Enable building of gtk gfio"
+  echo "--disable-numa          Disable libnuma even if found"
+  echo "--disable-rdma          Disable RDMA support even if found"
+  echo "--disable-gfapi         Disable gfapi"
+  echo "--enable-libhdfs        Enable hdfs support"
+  echo "--disable-lex           Disable use of lex/yacc for math"
+  echo "--disable-pmem          Disable pmem based engines even if found"
+  echo "--enable-lex            Enable use of lex/yacc for math"
+  echo "--disable-shm           Disable SHM support"
   echo "--disable-optimizations Don't enable compiler optimizations"
+  echo "--enable-cuda           Enable GPUDirect RDMA support"
   exit $exit_val
 fi
 
@@ -253,8 +261,9 @@
 # cross-compiling to one of these OSes then you'll need to specify
 # the correct CPU with the --cpu option.
 case $targetos in
-AIX)
+AIX|OpenBSD)
   # Unless explicitly enabled, turn off lex.
+  # OpenBSD hits a syntax error when lex is enabled.
   if test -z "$disable_lex" ; then
     disable_lex="yes"
   else
@@ -270,6 +279,17 @@
   if test -z "$cpu" && test "$(sysctl -n hw.optional.x86_64)" = "1"; then
     cpu="x86_64"
   fi
+  # If possible, error out at compile time when linking weak/partial symbols...
+cat > $TMPC <<EOF
+int main(void)
+{
+  return 0;
+}
+EOF
+  if compile_prog "" "-Wl,-no_weak_imports" "disable weak symbols"; then
+    echo "Disabling weak symbols"
+    LDFLAGS="$LDFLAGS -Wl,-no_weak_imports"
+  fi
   ;;
 SunOS)
   # `uname -m` returns i86pc even on an x86_64 box, so default based on isainfo
@@ -279,37 +299,49 @@
   LIBS="-lnsl -lsocket"
   ;;
 CYGWIN*)
-  echo "Forcing known good options on Windows"
+  # We still force some options, so keep this message here.
+  echo "Forcing some known good options on Windows"
   if test -z "$CC" ; then
     if test ! -z "$build_32bit_win" && test "$build_32bit_win" = "yes"; then
       CC="i686-w64-mingw32-gcc"
+      if test -e "../zlib/contrib/vstudio/vc14/x86/ZlibStatReleaseWithoutAsm/zlibstat.lib"; then
+        echo "Building with zlib support"
+        output_sym "CONFIG_ZLIB"
+        echo "LIBS=../zlib/contrib/vstudio/vc14/x86/ZlibStatReleaseWithoutAsm/zlibstat.lib" >> $config_host_mak
+      fi
     else
       CC="x86_64-w64-mingw32-gcc"
+      if test -e "../zlib/contrib/vstudio/vc14/x64/ZlibStatReleaseWithoutAsm/zlibstat.lib"; then
+        echo "Building with zlib support"
+        output_sym "CONFIG_ZLIB"
+        echo "LIBS=../zlib/contrib/vstudio/vc14/x64/ZlibStatReleaseWithoutAsm/zlibstat.lib" >> $config_host_mak
+      fi
     fi
   fi
-  output_sym "CONFIG_LITTLE_ENDIAN"
   if test ! -z "$build_32bit_win" && test "$build_32bit_win" = "yes"; then
     output_sym "CONFIG_32BIT"
   else
     output_sym "CONFIG_64BIT_LLP64"
   fi
-  output_sym "CONFIG_FADVISE"
-  output_sym "CONFIG_SOCKLEN_T"
-  output_sym "CONFIG_FADVISE"
-  output_sym "CONFIG_SFAA"
-  output_sym "CONFIG_RUSAGE_THREAD"
+  # We need this to be output_sym'd here because this is Windows specific.
+  # The regular configure path never sets this config.
   output_sym "CONFIG_WINDOWSAIO"
-  output_sym "CONFIG_FDATASYNC"
-  output_sym "CONFIG_CLOCK_MONOTONIC"
-  output_sym "CONFIG_GETTIMEOFDAY"
-  output_sym "CONFIG_CLOCK_GETTIME"
-  output_sym "CONFIG_SCHED_IDLE"
-  output_sym "CONFIG_TCP_NODELAY"
-  output_sym "CONFIG_TLS_THREAD"
-  output_sym "CONFIG_IPV6"
+  # We no longer exit 0 here, so the regular configuration path runs as well.
+  # The flags below are still necessary, mostly for MinGW.
+  socklen_t="yes"
+  sfaa="yes"
+  rusage_thread="yes"
+  fdatasync="yes"
+  clock_gettime="yes" # clock_monotonic probe has dependency on this
+  clock_monotonic="yes"
+  gettimeofday="yes"
+  sched_idle="yes"
+  tcp_nodelay="yes"
+  tls_thread="yes"
+  static_assert="yes"
+  ipv6="yes"
   echo "CC=$CC" >> $config_host_mak
-  echo "BUILD_CFLAGS=$CFLAGS -include config-host.h -D_GNU_SOURCE" >> $config_host_mak
-  exit 0
+  echo "BUILD_CFLAGS=$CFLAGS -I../zlib -include config-host.h -D_GNU_SOURCE" >> $config_host_mak
   ;;
 esac
 
@@ -344,6 +376,8 @@
   fi
 elif check_define __arm__ ; then
   cpu="arm"
+elif check_define __aarch64__ ; then
+  cpu="aarch64"
 elif check_define __hppa__ ; then
   cpu="hppa"
 else
@@ -356,7 +390,7 @@
     cpu="$cpu"
   ;;
   i386|i486|i586|i686|i86pc|BePC)
-    cpu="i386"
+    cpu="x86"
   ;;
   x86_64|amd64)
     cpu="x86_64"
@@ -364,6 +398,9 @@
   armv*b|armv*l|arm)
     cpu="arm"
   ;;
+  aarch64)
+    cpu="arm64"
+  ;;
   hppa|parisc|parisc64)
     cpu="hppa"
   ;;
@@ -393,7 +430,9 @@
 ##########################################
 # check cross compile
 
-cross_compile="no"
+if test "$cross_compile" != "yes" ; then
+  cross_compile="no"
+fi
 cat > $TMPC <<EOF
 int main(void)
 {
@@ -408,7 +447,9 @@
 
 ##########################################
 # check endianness
-bigendian="no"
+if test "$bigendian" != "yes" ; then
+  bigendian="no"
+fi
 if test "$cross_compile" = "no" ; then
   cat > $TMPC <<EOF
 #include <inttypes.h>
@@ -439,11 +480,11 @@
 fi
 
 
-echo "Operating system              $targetos"
-echo "CPU                           $cpu"
-echo "Big endian                    $bigendian"
-echo "Compiler                      $cc"
-echo "Cross compile                 $cross_compile"
+print_config "Operating system" "$targetos"
+print_config "CPU" "$cpu"
+print_config "Big endian" "$bigendian"
+print_config "Compiler" "$cc"
+print_config "Cross compile" "$cross_compile"
 echo
 
 ##########################################
@@ -454,7 +495,7 @@
 else
   build_static="no"
 fi
-echo "Static build                  $build_static"
+print_config "Static build" "$build_static"
 
 ##########################################
 # check for wordsize
@@ -475,11 +516,13 @@
 else
   fatal "Unknown wordsize"
 fi
-echo "Wordsize                      $wordsize"
+print_config "Wordsize" "$wordsize"
 
 ##########################################
 # zlib probe
-zlib="no"
+if test "$zlib" != "yes" ; then
+  zlib="no"
+fi
 cat > $TMPC <<EOF
 #include <zlib.h>
 int main(void)
@@ -494,11 +537,13 @@
   zlib=yes
   LIBS="-lz $LIBS"
 fi
-echo "zlib                          $zlib"
+print_config "zlib" "$zlib"
 
 ##########################################
 # linux-aio probe
-libaio="no"
+if test "$libaio" != "yes" ; then
+  libaio="no"
+fi
 if test "$esx" != "yes" ; then
   cat > $TMPC <<EOF
 #include <libaio.h>
@@ -519,12 +564,16 @@
     libaio=no
   fi
 fi
-echo "Linux AIO support             $libaio"
+print_config "Linux AIO support" "$libaio"
 
 ##########################################
 # posix aio probe
-posix_aio="no"
-posix_aio_lrt="no"
+if test "$posix_aio" != "yes" ; then
+  posix_aio="no"
+fi
+if test "$posix_aio_lrt" != "yes" ; then
+  posix_aio_lrt="no"
+fi
 cat > $TMPC <<EOF
 #include <aio.h>
 int main(void)
@@ -541,12 +590,14 @@
   posix_aio_lrt="yes"
   LIBS="-lrt $LIBS"
 fi
-echo "POSIX AIO support             $posix_aio"
-echo "POSIX AIO support needs -lrt  $posix_aio_lrt"
+print_config "POSIX AIO support" "$posix_aio"
+print_config "POSIX AIO support needs -lrt" "$posix_aio_lrt"
 
 ##########################################
 # posix aio fsync probe
-posix_aio_fsync="no"
+if test "$posix_aio_fsync" != "yes" ; then
+  posix_aio_fsync="no"
+fi
 if test "$posix_aio" = "yes" ; then
   cat > $TMPC <<EOF
 #include <fcntl.h>
@@ -562,11 +613,43 @@
     posix_aio_fsync=yes
   fi
 fi
-echo "POSIX AIO fsync               $posix_aio_fsync"
+print_config "POSIX AIO fsync" "$posix_aio_fsync"
+
+##########################################
+# POSIX pshared attribute probe
+if test "$posix_pshared" != "yes" ; then
+  posix_pshared="no"
+fi
+cat > $TMPC <<EOF
+#include <unistd.h>
+int main(void)
+{
+#if defined(_POSIX_THREAD_PROCESS_SHARED) && ((_POSIX_THREAD_PROCESS_SHARED + 0) > 0)
+# if defined(__CYGWIN__)
+#  error "_POSIX_THREAD_PROCESS_SHARED is buggy on Cygwin"
+# elif defined(__APPLE__)
+#  include <AvailabilityMacros.h>
+#  include <TargetConditionals.h>
+#  if TARGET_OS_MAC && MAC_OS_X_VERSION_MIN_REQUIRED < 1070
+#   error "_POSIX_THREAD_PROCESS_SHARED is buggy/unsupported prior to OSX 10.7"
+#  endif
+# endif
+#else
+# error "_POSIX_THREAD_PROCESS_SHARED is unsupported"
+#endif
+  return 0;
+}
+EOF
+if compile_prog "" "$LIBS" "posix_pshared" ; then
+  posix_pshared=yes
+fi
+print_config "POSIX pshared support" "$posix_pshared"
 
 ##########################################
 # solaris aio probe
-solaris_aio="no"
+if test "$solaris_aio" != "yes" ; then
+  solaris_aio="no"
+fi
 cat > $TMPC <<EOF
 #include <sys/types.h>
 #include <sys/asynch.h>
@@ -582,11 +665,13 @@
   solaris_aio=yes
   LIBS="-laio $LIBS"
 fi
-echo "Solaris AIO support           $solaris_aio"
+print_config "Solaris AIO support" "$solaris_aio"
 
 ##########################################
 # __sync_fetch_and_add test
-sfaa="no"
+if test "$sfaa" != "yes" ; then
+  sfaa="no"
+fi
 cat > $TMPC << EOF
 #include <inttypes.h>
 static int sfaa(uint64_t *ptr)
@@ -604,29 +689,32 @@
 if compile_prog "" "" "__sync_fetch_and_add()" ; then
     sfaa="yes"
 fi
-echo "__sync_fetch_and_add          $sfaa"
+print_config "__sync_fetch_and_add" "$sfaa"
 
 ##########################################
 # libverbs probe
-libverbs="no"
+if test "$libverbs" != "yes" ; then
+  libverbs="no"
+fi
 cat > $TMPC << EOF
-#include <stdio.h>
-#include <infiniband/arch.h>
+#include <infiniband/verbs.h>
 int main(int argc, char **argv)
 {
   struct ibv_pd *pd = ibv_alloc_pd(NULL);
   return 0;
 }
 EOF
-if compile_prog "" "-libverbs" "libverbs" ; then
+if test "$disable_rdma" != "yes" && compile_prog "" "-libverbs" "libverbs" ; then
     libverbs="yes"
     LIBS="-libverbs $LIBS"
 fi
-echo "libverbs                      $libverbs"
+print_config "libverbs" "$libverbs"
 
 ##########################################
 # rdmacm probe
-rdmacm="no"
+if test "$rdmacm" != "yes" ; then
+  rdmacm="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <rdma/rdma_cma.h>
@@ -636,15 +724,17 @@
   return 0;
 }
 EOF
-if compile_prog "" "-lrdmacm" "rdma"; then
+if test "$disable_rdma" != "yes" && compile_prog "" "-lrdmacm" "rdma"; then
     rdmacm="yes"
     LIBS="-lrdmacm $LIBS"
 fi
-echo "rdmacm                        $rdmacm"
+print_config "rdmacm" "$rdmacm"
 
 ##########################################
 # Linux fallocate probe
-linux_fallocate="no"
+if test "$linux_fallocate" != "yes" ; then
+  linux_fallocate="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <fcntl.h>
@@ -658,11 +748,13 @@
 if compile_prog "" "" "linux_fallocate"; then
     linux_fallocate="yes"
 fi
-echo "Linux fallocate               $linux_fallocate"
+print_config "Linux fallocate" "$linux_fallocate"
 
 ##########################################
 # POSIX fadvise probe
-posix_fadvise="no"
+if test "$posix_fadvise" != "yes" ; then
+  posix_fadvise="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <fcntl.h>
@@ -675,11 +767,13 @@
 if compile_prog "" "" "posix_fadvise"; then
     posix_fadvise="yes"
 fi
-echo "POSIX fadvise                 $posix_fadvise"
+print_config "POSIX fadvise" "$posix_fadvise"
 
 ##########################################
 # POSIX fallocate probe
-posix_fallocate="no"
+if test "$posix_fallocate" != "yes" ; then
+  posix_fallocate="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <fcntl.h>
@@ -692,12 +786,16 @@
 if compile_prog "" "" "posix_fallocate"; then
     posix_fallocate="yes"
 fi
-echo "POSIX fallocate               $posix_fallocate"
+print_config "POSIX fallocate" "$posix_fallocate"
 
 ##########################################
 # sched_set/getaffinity 2 or 3 argument test
-linux_2arg_affinity="no"
-linux_3arg_affinity="no"
+if test "$linux_2arg_affinity" != "yes" ; then
+  linux_2arg_affinity="no"
+fi
+if test "$linux_3arg_affinity" != "yes" ; then
+  linux_3arg_affinity="no"
+fi
 cat > $TMPC << EOF
 #include <sched.h>
 int main(int argc, char **argv)
@@ -721,12 +819,14 @@
     linux_2arg_affinity="yes"
   fi
 fi
-echo "sched_setaffinity(3 arg)      $linux_3arg_affinity"
-echo "sched_setaffinity(2 arg)      $linux_2arg_affinity"
+print_config "sched_setaffinity(3 arg)" "$linux_3arg_affinity"
+print_config "sched_setaffinity(2 arg)" "$linux_2arg_affinity"
 
 ##########################################
 # clock_gettime probe
-clock_gettime="no"
+if test "$clock_gettime" != "yes" ; then
+  clock_gettime="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <time.h>
@@ -741,11 +841,13 @@
     clock_gettime="yes"
     LIBS="-lrt $LIBS"
 fi
-echo "clock_gettime                 $clock_gettime"
+print_config "clock_gettime" "$clock_gettime"
 
 ##########################################
 # CLOCK_MONOTONIC probe
-clock_monotonic="no"
+if test "$clock_monotonic" != "yes" ; then
+  clock_monotonic="no"
+fi
 if test "$clock_gettime" = "yes" ; then
   cat > $TMPC << EOF
 #include <stdio.h>
@@ -759,11 +861,13 @@
       clock_monotonic="yes"
   fi
 fi
-echo "CLOCK_MONOTONIC               $clock_monotonic"
+print_config "CLOCK_MONOTONIC" "$clock_monotonic"
 
 ##########################################
 # CLOCK_MONOTONIC_RAW probe
-clock_monotonic_raw="no"
+if test "$clock_monotonic_raw" != "yes" ; then
+  clock_monotonic_raw="no"
+fi
 if test "$clock_gettime" = "yes" ; then
   cat > $TMPC << EOF
 #include <stdio.h>
@@ -777,11 +881,13 @@
       clock_monotonic_raw="yes"
   fi
 fi
-echo "CLOCK_MONOTONIC_RAW           $clock_monotonic_raw"
+print_config "CLOCK_MONOTONIC_RAW" "$clock_monotonic_raw"
 
 ##########################################
 # CLOCK_MONOTONIC_PRECISE probe
-clock_monotonic_precise="no"
+if test "$clock_monotonic_precise" != "yes" ; then
+  clock_monotonic_precise="no"
+fi
 if test "$clock_gettime" = "yes" ; then
   cat > $TMPC << EOF
 #include <stdio.h>
@@ -795,30 +901,33 @@
       clock_monotonic_precise="yes"
   fi
 fi
-echo "CLOCK_MONOTONIC_PRECISE       $clock_monotonic_precise"
+print_config "CLOCK_MONOTONIC_PRECISE" "$clock_monotonic_precise"
 
 ##########################################
 # clockid_t probe
-clockid_t="no"
+if test "$clockid_t" != "yes" ; then
+  clockid_t="no"
+fi
 cat > $TMPC << EOF
-#include <stdio.h>
-#include <string.h>
 #include <time.h>
+#include <string.h>
 int main(int argc, char **argv)
 {
-  clockid_t cid;
-  memset(&cid, 0, sizeof(cid));
-  return clock_gettime(cid, NULL);
+  volatile clockid_t cid;
+  memset((void*)&cid, 0, sizeof(cid));
+  return 0;
 }
 EOF
 if compile_prog "" "$LIBS" "clockid_t"; then
   clockid_t="yes"
 fi
-echo "clockid_t                     $clockid_t"
+print_config "clockid_t" "$clockid_t"
 
 ##########################################
 # gettimeofday() probe
-gettimeofday="no"
+if test "$gettimeofday" != "yes" ; then
+  gettimeofday="no"
+fi
 cat > $TMPC << EOF
 #include <sys/time.h>
 #include <stdio.h>
@@ -831,11 +940,13 @@
 if compile_prog "" "" "gettimeofday"; then
     gettimeofday="yes"
 fi
-echo "gettimeofday                  $gettimeofday"
+print_config "gettimeofday" "$gettimeofday"
 
 ##########################################
 # fdatasync() probe
-fdatasync="no"
+if test "$fdatasync" != "yes" ; then
+  fdatasync="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <unistd.h>
@@ -847,11 +958,13 @@
 if compile_prog "" "" "fdatasync"; then
   fdatasync="yes"
 fi
-echo "fdatasync                     $fdatasync"
+print_config "fdatasync" "$fdatasync"
 
 ##########################################
 # sync_file_range() probe
-sync_file_range="no"
+if test "$sync_file_range" != "yes" ; then
+  sync_file_range="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <unistd.h>
@@ -867,11 +980,13 @@
 if compile_prog "" "" "sync_file_range"; then
   sync_file_range="yes"
 fi
-echo "sync_file_range               $sync_file_range"
+print_config "sync_file_range" "$sync_file_range"
 
 ##########################################
 # ext4 move extent probe
-ext4_me="no"
+if test "$ext4_me" != "yes" ; then
+  ext4_me="no"
+fi
 cat > $TMPC << EOF
 #include <fcntl.h>
 #include <sys/ioctl.h>
@@ -889,11 +1004,13 @@
   # work. Takes a while to bubble back.
   ext4_me="yes"
 fi
-echo "EXT4 move extent              $ext4_me"
+print_config "EXT4 move extent" "$ext4_me"
 
 ##########################################
 # splice probe
-linux_splice="no"
+if test "$linux_splice" != "yes" ; then
+  linux_splice="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <fcntl.h>
@@ -905,11 +1022,13 @@
 if compile_prog "" "" "linux splice"; then
   linux_splice="yes"
 fi
-echo "Linux splice(2)               $linux_splice"
+print_config "Linux splice(2)" "$linux_splice"
 
 ##########################################
 # GUASI probe
-guasi="no"
+if test "$guasi" != "yes" ; then
+  guasi="no"
+fi
 cat > $TMPC << EOF
 #include <guasi.h>
 #include <guasi_syscalls.h>
@@ -922,11 +1041,13 @@
 if compile_prog "" "" "guasi"; then
   guasi="yes"
 fi
-echo "GUASI                         $guasi"
+print_config "GUASI" "$guasi"
 
 ##########################################
 # fusion-aw probe
-fusion_aw="no"
+if test "$fusion_aw" != "yes" ; then
+  fusion_aw="no"
+fi
 cat > $TMPC << EOF
 #include <nvm/nvm_primitives.h>
 int main(int argc, char **argv)
@@ -942,11 +1063,13 @@
   LIBS="-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -ldl -lpthread $LIBS"
   fusion_aw="yes"
 fi
-echo "Fusion-io atomic engine       $fusion_aw"
+print_config "Fusion-io atomic engine" "$fusion_aw"
 
 ##########################################
 # libnuma probe
-libnuma="no"
+if test "$libnuma" != "yes" ; then
+  libnuma="no"
+fi
 cat > $TMPC << EOF
 #include <numa.h>
 int main(int argc, char **argv)
@@ -958,10 +1081,10 @@
   libnuma="yes"
   LIBS="-lnuma $LIBS"
 fi
-echo "libnuma                       $libnuma"
+print_config "libnuma" "$libnuma"
 
 ##########################################
-# libnuma 2.x version API
+# libnuma 2.x version API, initialize with "no" only if $libnuma is set to "yes"
 if test "$libnuma" = "yes" ; then
 libnuma_v2="no"
 cat > $TMPC << EOF
@@ -975,12 +1098,14 @@
 if compile_prog "" "" "libnuma api"; then
   libnuma_v2="yes"
 fi
-echo "libnuma v2                    $libnuma_v2"
+print_config "libnuma v2" "$libnuma_v2"
 fi
 
 ##########################################
 # strsep() probe
-strsep="no"
+if test "$strsep" != "yes" ; then
+  strsep="no"
+fi
 cat > $TMPC << EOF
 #include <string.h>
 int main(int argc, char **argv)
@@ -993,11 +1118,13 @@
 if compile_prog "" "" "strsep"; then
   strsep="yes"
 fi
-echo "strsep                        $strsep"
+print_config "strsep" "$strsep"
 
 ##########################################
 # strcasestr() probe
-strcasestr="no"
+if test "$strcasestr" != "yes" ; then
+  strcasestr="no"
+fi
 cat > $TMPC << EOF
 #include <string.h>
 int main(int argc, char **argv)
@@ -1008,11 +1135,13 @@
 if compile_prog "" "" "strcasestr"; then
   strcasestr="yes"
 fi
-echo "strcasestr                    $strcasestr"
+print_config "strcasestr" "$strcasestr"
 
 ##########################################
 # strlcat() probe
-strlcat="no"
+if test "$strlcat" != "yes" ; then
+  strlcat="no"
+fi
 cat > $TMPC << EOF
 #include <string.h>
 int main(int argc, char **argv)
@@ -1027,11 +1156,13 @@
 if compile_prog "" "" "strlcat"; then
   strlcat="yes"
 fi
-echo "strlcat                       $strlcat"
+print_config "strlcat" "$strlcat"
 
 ##########################################
 # getopt_long_only() probe
-getopt_long_only="no"
+if test "$getopt_long_only" != "yes" ; then
+  getopt_long_only="no"
+fi
 cat > $TMPC << EOF
 #include <unistd.h>
 #include <stdio.h>
@@ -1045,11 +1176,13 @@
 if compile_prog "" "" "getopt_long_only"; then
   getopt_long_only="yes"
 fi
-echo "getopt_long_only()            $getopt_long_only"
+print_config "getopt_long_only()" "$getopt_long_only"
 
 ##########################################
 # inet_aton() probe
-inet_aton="no"
+if test "$inet_aton" != "yes" ; then
+  inet_aton="no"
+fi
 cat > $TMPC << EOF
 #include <sys/socket.h>
 #include <arpa/inet.h>
@@ -1063,11 +1196,13 @@
 if compile_prog "" "" "inet_aton"; then
   inet_aton="yes"
 fi
-echo "inet_aton                     $inet_aton"
+print_config "inet_aton" "$inet_aton"
 
 ##########################################
 # socklen_t probe
-socklen_t="no"
+if test "$socklen_t" != "yes" ; then
+  socklen_t="no"
+fi
 cat > $TMPC << EOF
 #include <sys/socket.h>
 int main(int argc, char **argv)
@@ -1079,11 +1214,13 @@
 if compile_prog "" "" "socklen_t"; then
   socklen_t="yes"
 fi
-echo "socklen_t                     $socklen_t"
+print_config "socklen_t" "$socklen_t"
 
 ##########################################
 # Whether or not __thread is supported for TLS
-tls_thread="no"
+if test "$tls_thread" != "yes" ; then
+  tls_thread="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 static __thread int ret;
@@ -1095,11 +1232,13 @@
 if compile_prog "" "" "__thread"; then
   tls_thread="yes"
 fi
-echo "__thread                      $tls_thread"
+print_config "__thread" "$tls_thread"
 
 ##########################################
 # Check if we have required gtk/glib support for gfio
-gfio="no"
+if test "$gfio" != "yes" ; then
+  gfio="no"
+fi
 if test "$gfio_check" = "yes" ; then
   cat > $TMPC << EOF
 #include <glib.h>
@@ -1110,7 +1249,7 @@
   gdk_threads_enter();
   gdk_threads_leave();
 
-  printf("%d", GTK_CHECK_VERSION(2, 18, 0));
+  return GTK_CHECK_VERSION(2, 18, 0) ? 0 : 1; /* 0 on success */
 }
 EOF
 GTK_CFLAGS=$(pkg-config --cflags gtk+-2.0 gthread-2.0)
@@ -1126,8 +1265,8 @@
   exit 1
 fi
 if compile_prog "$GTK_CFLAGS" "$GTK_LIBS" "gfio" ; then
-  r=$($TMPE)
-  if test "$r" != "0" ; then
+  $TMPE
+  if test "$?" = "0" ; then
     gfio="yes"
     GFIO_LIBS="$LIBS $GTK_LIBS"
     CFLAGS="$CFLAGS $GTK_CFLAGS"
@@ -1143,11 +1282,14 @@
 fi
 
 if test "$gfio_check" = "yes" ; then
-  echo "gtk 2.18 or higher            $gfio"
+  print_config "gtk 2.18 or higher" "$gfio"
 fi
 
+##########################################
 # Check whether we have getrusage(RUSAGE_THREAD)
-rusage_thread="no"
+if test "$rusage_thread" != "yes" ; then
+  rusage_thread="no"
+fi
 cat > $TMPC << EOF
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -1161,11 +1303,13 @@
 if compile_prog "" "" "RUSAGE_THREAD"; then
   rusage_thread="yes"
 fi
-echo "RUSAGE_THREAD                 $rusage_thread"
+print_config "RUSAGE_THREAD" "$rusage_thread"
 
 ##########################################
 # Check whether we have SCHED_IDLE
-sched_idle="no"
+if test "$sched_idle" != "yes" ; then
+  sched_idle="no"
+fi
 cat > $TMPC << EOF
 #include <sched.h>
 int main(int argc, char **argv)
@@ -1177,11 +1321,13 @@
 if compile_prog "" "" "SCHED_IDLE"; then
   sched_idle="yes"
 fi
-echo "SCHED_IDLE                    $sched_idle"
+print_config "SCHED_IDLE" "$sched_idle"
 
 ##########################################
 # Check whether we have TCP_NODELAY
-tcp_nodelay="no"
+if test "$tcp_nodelay" != "yes" ; then
+  tcp_nodelay="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/types.h>
@@ -1195,11 +1341,13 @@
 if compile_prog "" "" "TCP_NODELAY"; then
   tcp_nodelay="yes"
 fi
-echo "TCP_NODELAY                   $tcp_nodelay"
+print_config "TCP_NODELAY" "$tcp_nodelay"
 
 ##########################################
 # Check whether we have SO_SNDBUF
-window_size="no"
+if test "$window_size" != "yes" ; then
+  window_size="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/types.h>
@@ -1214,11 +1362,13 @@
 if compile_prog "" "" "SO_SNDBUF"; then
   window_size="yes"
 fi
-echo "Net engine window_size        $window_size"
+print_config "Net engine window_size" "$window_size"
 
 ##########################################
 # Check whether we have TCP_MAXSEG
-mss="no"
+if test "$mss" != "yes" ; then
+  mss="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/types.h>
@@ -1234,11 +1384,13 @@
 if compile_prog "" "" "TCP_MAXSEG"; then
   mss="yes"
 fi
-echo "TCP_MAXSEG                    $mss"
+print_config "TCP_MAXSEG" "$mss"
 
 ##########################################
 # Check whether we have RLIMIT_MEMLOCK
-rlimit_memlock="no"
+if test "$rlimit_memlock" != "yes" ; then
+  rlimit_memlock="no"
+fi
 cat > $TMPC << EOF
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -1251,11 +1403,13 @@
 if compile_prog "" "" "RLIMIT_MEMLOCK"; then
   rlimit_memlock="yes"
 fi
-echo "RLIMIT_MEMLOCK                $rlimit_memlock"
+print_config "RLIMIT_MEMLOCK" "$rlimit_memlock"
 
 ##########################################
 # Check whether we have pwritev/preadv
-pwritev="no"
+if test "$pwritev" != "yes" ; then
+  pwritev="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/uio.h>
@@ -1267,11 +1421,13 @@
 if compile_prog "" "" "pwritev"; then
   pwritev="yes"
 fi
-echo "pwritev/preadv                $pwritev"
+print_config "pwritev/preadv" "$pwritev"
 
 ##########################################
 # Check whether we have pwritev2/preadv2
-pwritev2="no"
+if test "$pwritev2" != "yes" ; then
+  pwritev2="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/uio.h>
@@ -1283,11 +1439,13 @@
 if compile_prog "" "" "pwritev2"; then
   pwritev2="yes"
 fi
-echo "pwritev2/preadv2              $pwritev2"
+print_config "pwritev2/preadv2" "$pwritev2"
 
 ##########################################
 # Check whether we have the required functions for ipv6
-ipv6="no"
+if test "$ipv6" != "yes" ; then
+  ipv6="no"
+fi
 cat > $TMPC << EOF
 #include <sys/types.h>
 #include <sys/socket.h>
@@ -1310,25 +1468,30 @@
 if compile_prog "" "" "ipv6"; then
   ipv6="yes"
 fi
-echo "IPv6 helpers                  $ipv6"
+print_config "IPv6 helpers" "$ipv6"
 
 ##########################################
 # check for rbd
-rbd="no"
+if test "$rbd" != "yes" ; then
+  rbd="no"
+fi
 cat > $TMPC << EOF
 #include <rbd/librbd.h>
 
 int main(int argc, char **argv)
 {
-
   rados_t cluster;
   rados_ioctx_t io_ctx;
+  const char cluster_name[] = "ceph";
+  const char user_name[] = "client.admin";
   const char pool[] = "rbd";
-
   int major, minor, extra;
-  rbd_version(&major, &minor, &extra);
 
+  rbd_version(&major, &minor, &extra);
+  /* The rados_create2 signature required was only introduced in ceph 0.65 */
+  rados_create2(&cluster, cluster_name, user_name, 0);
   rados_ioctx_create(cluster, pool, &io_ctx);
+
   return 0;
 }
 EOF
@@ -1336,11 +1499,13 @@
   LIBS="-lrbd -lrados $LIBS"
   rbd="yes"
 fi
-echo "Rados Block Device engine     $rbd"
+print_config "Rados Block Device engine" "$rbd"
 
 ##########################################
 # check for rbd_poll
-rbd_poll="no"
+if test "$rbd_poll" != "yes" ; then
+  rbd_poll="no"
+fi
 if test "$rbd" = "yes"; then
 cat > $TMPC << EOF
 #include <rbd/librbd.h>
@@ -1361,12 +1526,14 @@
 if compile_prog "" "-lrbd -lrados" "rbd"; then
   rbd_poll="yes"
 fi
-echo "rbd_poll                      $rbd_poll"
+print_config "rbd_poll" "$rbd_poll"
 fi
 
 ##########################################
 # check for rbd_invaidate_cache()
-rbd_inval="no"
+if test "$rbd_inval" != "yes" ; then
+  rbd_inval="no"
+fi
 if test "$rbd" = "yes"; then
 cat > $TMPC << EOF
 #include <rbd/librbd.h>
@@ -1381,12 +1548,14 @@
 if compile_prog "" "-lrbd -lrados" "rbd"; then
   rbd_inval="yes"
 fi
-echo "rbd_invalidate_cache          $rbd_inval"
+print_config "rbd_invalidate_cache" "$rbd_inval"
 fi
 
 ##########################################
 # check for blkin
-rbd_blkin="no"
+if test "$rbd_blkin" != "yes" ; then
+  rbd_blkin="no"
+fi
 cat > $TMPC << EOF
 #include <rbd/librbd.h>
 #include <zipkin_c.h>
@@ -1410,11 +1579,13 @@
   LIBS="-lblkin $LIBS"
   rbd_blkin="yes"
 fi
-echo "rbd blkin tracing             $rbd_blkin"
+print_config "rbd blkin tracing" "$rbd_blkin"
 
 ##########################################
 # Check whether we have setvbuf
-setvbuf="no"
+if test "$setvbuf" != "yes" ; then
+  setvbuf="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 int main(int argc, char **argv)
@@ -1428,16 +1599,18 @@
 if compile_prog "" "" "setvbuf"; then
   setvbuf="yes"
 fi
-echo "setvbuf                       $setvbuf"
+print_config "setvbuf" "$setvbuf"
 
+##########################################
 # check for gfapi
-gfapi="no"
+if test "$gfapi" != "yes" ; then
+  gfapi="no"
+fi
 cat > $TMPC << EOF
 #include <glusterfs/api/glfs.h>
 
 int main(int argc, char **argv)
 {
-
   glfs_t *g = glfs_new("foo");
 
   return 0;
@@ -1447,10 +1620,10 @@
   LIBS="-lgfapi -lglusterfs $LIBS"
   gfapi="yes"
 fi
- echo "Gluster API engine            $gfapi"
+print_config "Gluster API engine" "$gfapi"
 
 ##########################################
-# check for gfapi fadvise support
+# check for gfapi fadvise support, initialize with "no" only if $gfapi is set to "yes"
 if test "$gfapi" = "yes" ; then
 gf_fadvise="no"
 cat > $TMPC << EOF
@@ -1467,12 +1640,14 @@
 if compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then
   gf_fadvise="yes"
 fi
-echo "Gluster API use fadvise       $gf_fadvise"
+print_config "Gluster API use fadvise" "$gf_fadvise"
 fi
 
 ##########################################
 # check for gfapi trim support
-gf_trim="no"
+if test "$gf_trim" != "yes" ; then
+  gf_trim="no"
+fi
 if test "$gfapi" = "yes" ; then
 cat > $TMPC << EOF
 #include <glusterfs/api/glfs.h>
@@ -1485,12 +1660,14 @@
 if compile_prog "" "-lgfapi -lglusterfs" "gf trim"; then
   gf_trim="yes"
 fi
-echo "Gluster API trim support      $gf_trim"
+print_config "Gluster API trim support" "$gf_trim"
 fi
 
 ##########################################
 # Check if we support stckf on s390
-s390_z196_facilities="no"
+if test "$s390_z196_facilities" != "yes" ; then
+  s390_z196_facilities="no"
+fi
 cat > $TMPC << EOF
 #define STFLE_BITS_Z196 45 /* various z196 facilities ... */
 int main(int argc, char **argv)
@@ -1513,11 +1690,11 @@
 EOF
 if compile_prog "" "" "s390_z196_facilities"; then
   $TMPE
-  if [[ $? -eq 0 ]]; then
+  if [ $? -eq 0 ]; then
   	s390_z196_facilities="yes"
   fi
 fi
-echo "s390_z196_facilities          $s390_z196_facilities"
+print_config "s390_z196_facilities" "$s390_z196_facilities"
 
 ##########################################
 # Check if we have required environment variables configured for libhdfs
@@ -1543,11 +1720,13 @@
     FIO_HDFS_CPU="amd64"
   fi
 fi
-echo "HDFS engine                   $libhdfs"
+print_config "HDFS engine" "$libhdfs"
 
 ##########################################
 # Check whether we have MTD
-mtd="no"
+if test "$mtd" != "yes" ; then
+  mtd="no"
+fi
 cat > $TMPC << EOF
 #include <string.h>
 #include <mtd/mtd-user.h>
@@ -1564,16 +1743,68 @@
 if compile_prog "" "" "mtd"; then
   mtd="yes"
 fi
-echo "MTD                           $mtd"
+print_config "MTD" "$mtd"
+
+##########################################
+# Check whether we have libpmem
+if test "$libpmem" != "yes" ; then
+  libpmem="no"
+fi
+cat > $TMPC << EOF
+#include <libpmem.h>
+int main(int argc, char **argv)
+{
+  int rc;
+  rc = pmem_is_pmem(0, 0);
+  return 0;
+}
+EOF
+if compile_prog "" "-lpmem" "libpmem"; then
+  libpmem="yes"
+  LIBS="-lpmem $LIBS"
+fi
+print_config "libpmem" "$libpmem"
+
+##########################################
+# Check whether we have libpmemblk
+# libpmem is a prerequisite
+if test "$libpmemblk" != "yes" ; then
+  libpmemblk="no"
+fi
+if test "$libpmem" = "yes"; then
+  cat > $TMPC << EOF
+#include <libpmemblk.h>
+int main(int argc, char **argv)
+{
+  PMEMblkpool *pbp;
+  pbp = pmemblk_open("", 0);
+  return 0;
+}
+EOF
+  if compile_prog "" "-lpmemblk" "libpmemblk"; then
+    libpmemblk="yes"
+    LIBS="-lpmemblk $LIBS"
+  fi
+fi
+print_config "libpmemblk" "$libpmemblk"
+
+# Choose the ioengines
+if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then
+  devdax="yes"
+  if test "$libpmemblk" = "yes"; then
+    pmemblk="yes"
+  fi
+fi
 
 ##########################################
 # Report whether pmemblk engine is enabled
-echo "NVML libpmemblk engine        $pmemblk"
+print_config "NVML pmemblk engine" "$pmemblk"
 
 ##########################################
 # Report whether dev-dax engine is enabled
-echo "NVM Device Dax engine        $devdax"
+print_config "NVML dev-dax engine" "$devdax"
 
+##########################################
 # Check if we have lex/yacc available
 yacc="no"
 yacc_is_bison="no"
@@ -1632,11 +1863,13 @@
 fi
 fi
 
-echo "lex/yacc for arithmetic       $arith"
+print_config "lex/yacc for arithmetic" "$arith"
 
 ##########################################
 # Check whether we have setmntent/getmntent
-getmntent="no"
+if test "$getmntent" != "yes" ; then
+  getmntent="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <mntent.h>
@@ -1651,7 +1884,7 @@
 if compile_prog "" "" "getmntent"; then
   getmntent="yes"
 fi
-echo "getmntent                     $getmntent"
+print_config "getmntent" "$getmntent"
 
 ##########################################
 # Check whether we have getmntinfo
@@ -1660,7 +1893,9 @@
 
 # getmntinfo(3) for FreeBSD/DragonFlyBSD/OpenBSD.
 # Note that NetBSD needs -Werror to catch warning as error.
-getmntinfo="no"
+if test "$getmntinfo" != "yes" ; then
+  getmntinfo="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/param.h>
@@ -1674,10 +1909,12 @@
 if compile_prog "-Werror" "" "getmntinfo"; then
   getmntinfo="yes"
 fi
-echo "getmntinfo                    $getmntinfo"
+print_config "getmntinfo" "$getmntinfo"
 
 # getmntinfo(3) for NetBSD.
-getmntinfo_statvfs="no"
+if test "$getmntinfo_statvfs" != "yes" ; then
+  getmntinfo_statvfs="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/statvfs.h>
@@ -1690,25 +1927,18 @@
 # Skip the test if the one with statfs arg is detected.
 if test "$getmntinfo" != "yes" && compile_prog "-Werror" "" "getmntinfo_statvfs"; then
   getmntinfo_statvfs="yes"
-  echo "getmntinfo_statvfs            $getmntinfo_statvfs"
+  print_config "getmntinfo_statvfs" "$getmntinfo_statvfs"
 fi
 
 ##########################################
 # Check whether we have _Static_assert
-static_assert="no"
+if test "$static_assert" != "yes" ; then
+  static_assert="no"
+fi
 cat > $TMPC << EOF
 #include <assert.h>
 #include <stdlib.h>
-#undef offsetof
-#ifdef __compiler_offsetof
-#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
-#else
-#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
-#endif
-
-#define container_of(ptr, type, member) ({			\
-	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
-	(type *)( (char *)__mptr - offsetof(type,member) );})
+#include <stddef.h>
 
 struct foo {
   int a, b;
@@ -1723,11 +1953,13 @@
 if compile_prog "" "" "static_assert"; then
     static_assert="yes"
 fi
-echo "Static Assert                 $static_assert"
+print_config "Static Assert" "$static_assert"
 
 ##########################################
 # Check whether we have bool / stdbool.h
-have_bool="no"
+if test "$have_bool" != "yes" ; then
+  have_bool="no"
+fi
 cat > $TMPC << EOF
 #include <stdbool.h>
 int main(int argc, char **argv)
@@ -1739,7 +1971,67 @@
 if compile_prog "" "" "bool"; then
   have_bool="yes"
 fi
-echo "bool                          $have_bool"
+print_config "bool" "$have_bool"
+
+##########################################
+# Check whether we have strndup()
+strndup="no"
+cat > $TMPC << EOF
+#include <string.h>
+#include <stdlib.h>
+int main(int argc, char **argv)
+{
+  char *res = strndup("test string", 8);
+
+  free(res);
+  return 0;
+}
+EOF
+if compile_prog "" "" "strndup"; then
+  strndup="yes"
+fi
+print_config "strndup" "$strndup"
+
+##########################################
+# check march=armv8-a+crc+crypto
+if test "$march_armv8_a_crc_crypto" != "yes" ; then
+  march_armv8_a_crc_crypto="no"
+fi
+if test "$cpu" = "arm64" ; then
+  cat > $TMPC <<EOF
+#include <sys/auxv.h>
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+int main(void)
+{
+  return 0;
+}
+EOF
+  if compile_prog "-march=armv8-a+crc+crypto" "" ""; then
+    march_armv8_a_crc_crypto="yes"
+    CFLAGS="$CFLAGS -march=armv8-a+crc+crypto -DARCH_HAVE_CRC_CRYPTO"
+  fi
+fi
+print_config "march_armv8_a_crc_crypto" "$march_armv8_a_crc_crypto"
+
+##########################################
+# cuda probe
+if test "$cuda" != "yes" ; then
+  cuda="no"
+fi
+cat > $TMPC << EOF
+#include <cuda.h>
+int main(int argc, char **argv)
+{
+  return cuInit(0);
+}
+EOF
+if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then
+  cuda="yes"
+  LIBS="-lcuda $LIBS"
+fi
+print_config "cuda" "$cuda"
 
 #############################################################################
 
@@ -1767,6 +2059,9 @@
 if test "$posix_aio_fsync" = "yes" ; then
   output_sym "CONFIG_POSIXAIO_FSYNC"
 fi
+if test "$posix_pshared" = "yes" ; then
+  output_sym "CONFIG_PSHARED"
+fi
 if test "$linux_fallocate" = "yes" ; then
   output_sym "CONFIG_LINUX_FALLOCATE"
 fi
@@ -1854,7 +2149,7 @@
   output_sym "CONFIG_RUSAGE_THREAD"
 fi
 if test "$gfio" = "yes" ; then
-  echo "CONFIG_GFIO=y" >> $config_host_mak
+  output_sym "CONFIG_GFIO"
 fi
 if test "$esx" = "yes" ; then
   output_sym "CONFIG_ESX"
@@ -1954,10 +2249,18 @@
 if test "$have_bool" = "yes" ; then
   output_sym "CONFIG_HAVE_BOOL"
 fi
-
+if test "$strndup" = "yes" ; then
+  output_sym "CONFIG_HAVE_STRNDUP"
+fi
+if test "$disable_opt" = "yes" ; then
+  output_sym "CONFIG_DISABLE_OPTIMIZATIONS"
+fi
 if test "$zlib" = "no" ; then
   echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it."
 fi
+if test "$cuda" = "yes" ; then
+  output_sym "CONFIG_CUDA"
+fi
 
 echo "LIBS+=$LIBS" >> $config_host_mak
 echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak
diff -Nru fio-2.16/crc/crc32c-arm64.c fio-3.1/crc/crc32c-arm64.c
--- fio-2.16/crc/crc32c-arm64.c	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/crc/crc32c-arm64.c	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,114 @@
+#include "crc32c.h"
+
+#define CRC32C3X8(ITR) \
+	crc1 = __crc32cd(crc1, *((const uint64_t *)data + 42*1 + (ITR)));\
+	crc2 = __crc32cd(crc2, *((const uint64_t *)data + 42*2 + (ITR)));\
+	crc0 = __crc32cd(crc0, *((const uint64_t *)data + 42*0 + (ITR)));
+
+#define CRC32C7X3X8(ITR) do {\
+	CRC32C3X8((ITR)*7+0) \
+	CRC32C3X8((ITR)*7+1) \
+	CRC32C3X8((ITR)*7+2) \
+	CRC32C3X8((ITR)*7+3) \
+	CRC32C3X8((ITR)*7+4) \
+	CRC32C3X8((ITR)*7+5) \
+	CRC32C3X8((ITR)*7+6) \
+	} while(0)
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32             (1 << 7)
+#endif /* HWCAP_CRC32 */
+
+bool crc32c_arm64_available = false;
+
+#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO
+
+#include <sys/auxv.h>
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+static bool crc32c_probed;
+
+/*
+ * Function to calculate reflected crc with PMULL Instruction
+ * crc done "by 3" for fixed input block size of 1024 bytes
+ */
+uint32_t crc32c_arm64(unsigned char const *data, unsigned long length)
+{
+	signed long len = length;
+	uint32_t crc = ~0;
+	uint32_t crc0, crc1, crc2;
+
+	/* Load two consts: K1 and K2 */
+	const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+	uint64_t t0, t1;
+
+	while ((len -= 1024) >= 0) {
+		/* Do first 8 bytes here for better pipelining */
+		crc0 = __crc32cd(crc, *(const uint64_t *)data);
+		crc1 = 0;
+		crc2 = 0;
+		data += sizeof(uint64_t);
+
+		/* Process block inline
+		   Process crc0 last to avoid dependency with above */
+		CRC32C7X3X8(0);
+		CRC32C7X3X8(1);
+		CRC32C7X3X8(2);
+		CRC32C7X3X8(3);
+		CRC32C7X3X8(4);
+		CRC32C7X3X8(5);
+
+		data += 42*3*sizeof(uint64_t);
+
+		/* Merge crc0 and crc1 into crc2
+		   crc1 multiply by K2
+		   crc0 multiply by K1 */
+
+		t1 = (uint64_t)vmull_p64(crc1, k2);
+		t0 = (uint64_t)vmull_p64(crc0, k1);
+		crc = __crc32cd(crc2, *(const uint64_t *)data);
+		crc1 = __crc32cd(0, t1);
+		crc ^= crc1;
+		crc0 = __crc32cd(0, t0);
+		crc ^= crc0;
+
+		data += sizeof(uint64_t);
+	}
+
+	if (!(len += 1024))
+		return crc;
+
+	while ((len -= sizeof(uint64_t)) >= 0) {
+                crc = __crc32cd(crc, *(const uint64_t *)data);
+                data += sizeof(uint64_t);
+        }
+
+        /* The following is more efficient than the straight loop */
+        if (len & sizeof(uint32_t)) {
+                crc = __crc32cw(crc, *(const uint32_t *)data);
+                data += sizeof(uint32_t);
+        }
+        if (len & sizeof(uint16_t)) {
+                crc = __crc32ch(crc, *(const uint16_t *)data);
+                data += sizeof(uint16_t);
+        }
+        if (len & sizeof(uint8_t)) {
+                crc = __crc32cb(crc, *(const uint8_t *)data);
+        }
+
+	return crc;
+}
+
+void crc32c_arm64_probe(void)
+{
+	unsigned long hwcap;
+
+	if (!crc32c_probed) {
+		hwcap = getauxval(AT_HWCAP);
+		crc32c_arm64_available = (hwcap & HWCAP_CRC32) != 0;
+		crc32c_probed = true;
+	}
+}
+
+#endif /* ARCH_HAVE_ARM64_CRC_CRYPTO */
diff -Nru fio-2.16/crc/crc32c.h fio-3.1/crc/crc32c.h
--- fio-2.16/crc/crc32c.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/crc/crc32c.h	2017-09-28 10:23:20.000000000 +0000
@@ -19,9 +19,21 @@
 #define CRC32C_H
 
 #include "../arch/arch.h"
+#include "../lib/types.h"
 
 extern uint32_t crc32c_sw(unsigned char const *, unsigned long);
-extern int crc32c_intel_available;
+extern bool crc32c_arm64_available;
+extern bool crc32c_intel_available;
+
+#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO
+extern uint32_t crc32c_arm64(unsigned char const *, unsigned long);
+extern void crc32c_arm64_probe(void);
+#else
+#define crc32c_arm64 crc32c_sw
+static inline void crc32c_arm64_probe(void)
+{
+}
+#endif
 
 #ifdef ARCH_HAVE_SSE4_2
 extern uint32_t crc32c_intel(unsigned char const *, unsigned long);
@@ -35,6 +47,9 @@
 
 static inline uint32_t fio_crc32c(unsigned char const *buf, unsigned long len)
 {
+	if (crc32c_arm64_available)
+		return crc32c_arm64(buf, len);
+
 	if (crc32c_intel_available)
 		return crc32c_intel(buf, len);
 
diff -Nru fio-2.16/crc/crc32c-intel.c fio-3.1/crc/crc32c-intel.c
--- fio-2.16/crc/crc32c-intel.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/crc/crc32c-intel.c	2017-09-28 10:23:20.000000000 +0000
@@ -18,7 +18,7 @@
  * Volume 2A: Instruction Set Reference, A-M
  */
 
-int crc32c_intel_available = 0;
+bool crc32c_intel_available = false;
 
 #ifdef ARCH_HAVE_SSE4_2
 
@@ -30,7 +30,7 @@
 #define SCALE_F 4
 #endif
 
-static int crc32c_probed;
+static bool crc32c_probed;
 
 static uint32_t crc32c_intel_le_hw_byte(uint32_t crc, unsigned char const *data,
 					unsigned long length)
@@ -87,7 +87,7 @@
 
 		do_cpuid(&eax, &ebx, &ecx, &edx);
 		crc32c_intel_available = (ecx & (1 << 20)) != 0;
-		crc32c_probed = 1;
+		crc32c_probed = true;
 	}
 }
 
diff -Nru fio-2.16/crc/fnv.c fio-3.1/crc/fnv.c
--- fio-2.16/crc/fnv.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/crc/fnv.c	2017-09-28 10:23:20.000000000 +0000
@@ -2,14 +2,32 @@
 
 #define FNV_PRIME	0x100000001b3ULL
 
+/*
+ * 64-bit fnv, but don't require 64-bit multiples of data. Use bytes
+ * for the last unaligned chunk.
+ */
 uint64_t fnv(const void *buf, uint32_t len, uint64_t hval)
 {
 	const uint64_t *ptr = buf;
-	const uint64_t *end = (void *) buf + len;
 
-	while (ptr < end) {
+	while (len) {
 		hval *= FNV_PRIME;
-		hval ^= (uint64_t) *ptr++;
+		if (len >= sizeof(uint64_t)) {
+			hval ^= (uint64_t) *ptr++;
+			len -= sizeof(uint64_t);
+			continue;
+		} else {
+			const uint8_t *ptr8 = (const uint8_t *) ptr;
+			uint64_t val = 0;
+			int i;
+
+			for (i = 0; i < len; i++) {
+				val <<= 8;
+				val |= (uint8_t) *ptr8++;
+			}
+			hval ^= val;
+			break;
+		}
 	}
 
 	return hval;
diff -Nru fio-2.16/crc/sha3.c fio-3.1/crc/sha3.c
--- fio-2.16/crc/sha3.c	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/crc/sha3.c	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,173 @@
+/*
+ * Cryptographic API.
+ *
+ * SHA-3, as specified in
+ * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+ *
+ * SHA-3 code by Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <string.h>
+#include <inttypes.h>
+
+#include "../os/os.h"
+
+#include "sha3.h"
+
+#define KECCAK_ROUNDS 24
+
+#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
+
+static const uint64_t keccakf_rndc[24] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
+	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
+	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
+	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+	0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
+	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+	0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
+	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+static const int keccakf_rotc[24] = {
+	1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
+	27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
+};
+
+static const int keccakf_piln[24] = {
+	10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
+	15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
+};
+
+/* update the state in place by running KECCAK_ROUNDS rounds of Keccak-f */
+
+static void keccakf(uint64_t st[25])
+{
+	int i, j, round;
+	uint64_t t, bc[5];
+
+	for (round = 0; round < KECCAK_ROUNDS; round++) {
+
+		/* Theta */
+		for (i = 0; i < 5; i++)
+			bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15]
+				^ st[i + 20];
+
+		for (i = 0; i < 5; i++) {
+			t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
+			for (j = 0; j < 25; j += 5)
+				st[j + i] ^= t;
+		}
+
+		/* Rho Pi */
+		t = st[1];
+		for (i = 0; i < 24; i++) {
+			j = keccakf_piln[i];
+			bc[0] = st[j];
+			st[j] = ROTL64(t, keccakf_rotc[i]);
+			t = bc[0];
+		}
+
+		/* Chi */
+		for (j = 0; j < 25; j += 5) {
+			for (i = 0; i < 5; i++)
+				bc[i] = st[j + i];
+			for (i = 0; i < 5; i++)
+				st[j + i] ^= (~bc[(i + 1) % 5]) &
+					     bc[(i + 2) % 5];
+		}
+
+		/* Iota */
+		st[0] ^= keccakf_rndc[round];
+	}
+}
+
+static void fio_sha3_init(struct fio_sha3_ctx *sctx, unsigned int digest_sz)
+{
+	memset(sctx->st, 0, sizeof(sctx->st));
+	sctx->md_len = digest_sz;
+	sctx->rsiz = 200 - 2 * digest_sz;
+	sctx->rsizw = sctx->rsiz / 8;
+	sctx->partial = 0;
+	memset(sctx->buf, 0, sizeof(sctx->buf));
+}
+
+void fio_sha3_224_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_224_DIGEST_SIZE);
+}
+
+void fio_sha3_256_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_256_DIGEST_SIZE);
+}
+
+void fio_sha3_384_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_384_DIGEST_SIZE);
+}
+
+void fio_sha3_512_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_512_DIGEST_SIZE);
+}
+
+int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data,
+		    unsigned int len)
+{
+	unsigned int done;
+	const uint8_t *src;
+
+	done = 0;
+	src = data;
+
+	if ((sctx->partial + len) > (sctx->rsiz - 1)) {
+		if (sctx->partial) {
+			done = -sctx->partial;
+			memcpy(sctx->buf + sctx->partial, data,
+			       done + sctx->rsiz);
+			src = sctx->buf;
+		}
+
+		do {
+			unsigned int i;
+
+			for (i = 0; i < sctx->rsizw; i++)
+				sctx->st[i] ^= ((uint64_t *) src)[i];
+			keccakf(sctx->st);
+
+			done += sctx->rsiz;
+			src = data + done;
+		} while (done + (sctx->rsiz - 1) < len);
+
+		sctx->partial = 0;
+	}
+	memcpy(sctx->buf + sctx->partial, src, len - done);
+	sctx->partial += (len - done);
+
+	return 0;
+}
+
+void fio_sha3_final(struct fio_sha3_ctx *sctx)
+{
+	unsigned int i, inlen = sctx->partial;
+
+	sctx->buf[inlen++] = 0x06;
+	memset(sctx->buf + inlen, 0, sctx->rsiz - inlen);
+	sctx->buf[sctx->rsiz - 1] |= 0x80;
+
+	for (i = 0; i < sctx->rsizw; i++)
+		sctx->st[i] ^= ((uint64_t *) sctx->buf)[i];
+
+	keccakf(sctx->st);
+
+	for (i = 0; i < sctx->rsizw; i++)
+		sctx->st[i] = cpu_to_le64(sctx->st[i]);
+
+	memcpy(sctx->sha, sctx->st, sctx->md_len);
+}
diff -Nru fio-2.16/crc/sha3.h fio-3.1/crc/sha3.h
--- fio-2.16/crc/sha3.h	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/crc/sha3.h	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,42 @@
+/*
+ * Common values for SHA-3 algorithms
+ */
+#ifndef __CRYPTO_SHA3_H__
+#define __CRYPTO_SHA3_H__
+
+#include <inttypes.h>
+
+#define SHA3_224_DIGEST_SIZE	(224 / 8)
+#define SHA3_224_BLOCK_SIZE	(200 - 2 * SHA3_224_DIGEST_SIZE)
+
+#define SHA3_256_DIGEST_SIZE	(256 / 8)
+#define SHA3_256_BLOCK_SIZE	(200 - 2 * SHA3_256_DIGEST_SIZE)
+
+#define SHA3_384_DIGEST_SIZE	(384 / 8)
+#define SHA3_384_BLOCK_SIZE	(200 - 2 * SHA3_384_DIGEST_SIZE)
+
+#define SHA3_512_DIGEST_SIZE	(512 / 8)
+#define SHA3_512_BLOCK_SIZE	(200 - 2 * SHA3_512_DIGEST_SIZE)
+
+struct fio_sha3_ctx {
+	uint64_t	st[25];
+	unsigned int	md_len;
+	unsigned int	rsiz;
+	unsigned int	rsizw;
+
+	unsigned int	partial;
+	uint8_t		buf[SHA3_224_BLOCK_SIZE];
+
+	uint8_t		*sha;
+};
+
+void fio_sha3_224_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_256_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_384_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_512_init(struct fio_sha3_ctx *sctx);
+
+int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data,
+		    unsigned int len);
+void fio_sha3_final(struct fio_sha3_ctx *sctx);
+
+#endif
diff -Nru fio-2.16/crc/test.c fio-3.1/crc/test.c
--- fio-2.16/crc/test.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/crc/test.c	2017-09-28 10:23:20.000000000 +0000
@@ -16,6 +16,7 @@
 #include "../crc/sha1.h"
 #include "../crc/sha256.h"
 #include "../crc/sha512.h"
+#include "../crc/sha3.h"
 #include "../crc/xxhash.h"
 #include "../crc/murmur3.h"
 #include "../crc/fnv.h"
@@ -47,6 +48,10 @@
 	T_MURMUR3	= 1U << 10,
 	T_JHASH		= 1U << 11,
 	T_FNV		= 1U << 12,
+	T_SHA3_224	= 1U << 13,
+	T_SHA3_256	= 1U << 14,
+	T_SHA3_384	= 1U << 15,
+	T_SHA3_512	= 1U << 16,
 };
 
 static void t_md5(struct test_type *t, void *buf, size_t size)
@@ -143,6 +148,62 @@
 		fio_sha512_update(&ctx, buf, size);
 }
 
+static void t_sha3_224(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_224_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_224_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_256(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_256_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_256_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_384(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_384_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_384_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_512(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_512_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_512_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
 static void t_murmur3(struct test_type *t, void *buf, size_t size)
 {
 	int i;
@@ -247,6 +308,26 @@
 		.fn = t_fnv,
 	},
 	{
+		.name = "sha3-224",
+		.mask = T_SHA3_224,
+		.fn = t_sha3_224,
+	},
+	{
+		.name = "sha3-256",
+		.mask = T_SHA3_256,
+		.fn = t_sha3_256,
+	},
+	{
+		.name = "sha3-384",
+		.mask = T_SHA3_384,
+		.fn = t_sha3_384,
+	},
+	{
+		.name = "sha3-512",
+		.mask = T_SHA3_512,
+		.fn = t_sha3_512,
+	},
+	{
 		.name = NULL,
 	},
 };
@@ -291,6 +372,7 @@
 	int i, first = 1;
 	void *buf;
 
+	crc32c_arm64_probe();
 	crc32c_intel_probe();
 
 	if (!type)
@@ -310,7 +392,7 @@
 	fill_random_buf(&state, buf, CHUNK);
 
 	for (i = 0; t[i].name; i++) {
-		struct timeval tv;
+		struct timespec ts;
 		double mb_sec;
 		uint64_t usec;
 		char pre[3];
@@ -327,9 +409,9 @@
 			t[i].fn(&t[i], buf, CHUNK);
 		}
 
-		fio_gettime(&tv, NULL);
+		fio_gettime(&ts, NULL);
 		t[i].fn(&t[i], buf, CHUNK);
-		usec = utime_since_now(&tv);
+		usec = utime_since_now(&ts);
 
 		if (usec) {
 			mb_sec = (double) mb / (double) usec;
@@ -338,9 +420,9 @@
 				sprintf(pre, "\t");
 			else
 				sprintf(pre, "\t\t");
-			printf("%s:%s%8.2f MB/sec\n", t[i].name, pre, mb_sec);
+			printf("%s:%s%8.2f MiB/sec\n", t[i].name, pre, mb_sec);
 		} else
-			printf("%s:inf MB/sec\n", t[i].name);
+			printf("%s:inf MiB/sec\n", t[i].name);
 		first = 0;
 	}
 
diff -Nru fio-2.16/debian/changelog fio-3.1/debian/changelog
--- fio-2.16/debian/changelog	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/changelog	2017-10-24 07:47:45.000000000 +0000
@@ -1,3 +1,27 @@
+fio (3.1-1) unstable; urgency=medium
+
+  * Imported upstream versions 3.0 and 3.1 (Closes: #869686).
+  * Adapted my mail address to new company domain.
+  * patches:
+    - Refreshed.
+    - Dropped patch spelling-errors. Applied upstream.
+    - Changed author in all of my patches to new mail address.
+  * control: Bumped standards version to 4.1.1.0.
+    * watch: Changed to pgpsigurlmangle as recommended by policy 4.1.0.0,
+      section 4.11.
+
+ -- Martin Steigerwald <martin.steigerwald@proact.de>  Tue, 24 Oct 2017 09:47:45 +0200
+
+fio (2.17-1) unstable; urgency=medium
+
+  * Imported upstream version 2.17.
+  * debian/patches: Refreshed.
+  * debian/copyright: Updated, added some files, all GPL-2.
+  * debian/docs: Added fio_latency2csv.py, example systemd fio.service file, and
+    logparser histogram scripts from tools/hist.
+
+ -- Martin Steigerwald <martin.steigerwald@teamix.de>  Mon, 23 Jan 2017 11:10:42 +0100
+
 fio (2.16-1) unstable; urgency=medium
 
   * Imported upstream version 2.16.
diff -Nru fio-2.16/debian/control fio-3.1/debian/control
--- fio-2.16/debian/control	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/control	2017-10-24 07:47:45.000000000 +0000
@@ -1,9 +1,9 @@
 Source: fio
 Section: utils
 Priority: optional
-Maintainer: Martin Steigerwald <martin.steigerwald@teamix.de>
+Maintainer: Martin Steigerwald <martin.steigerwald@proact.de>
 Build-Depends: debhelper (>= 9), dpkg-dev (>= 1.16.1~), libaio-dev, zlib1g-dev, librdmacm-dev, libibverbs-dev, librbd-dev, libgtk2.0-dev, libcairo2-dev
-Standards-Version: 3.9.8
+Standards-Version: 4.1.1.0
 Homepage: https://github.com/axboe/fio
 Vcs-Git: https://anonscm.debian.org/git/collab-maint/fio.git
 Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/fio.git
diff -Nru fio-2.16/debian/copyright fio-3.1/debian/copyright
--- fio-2.16/debian/copyright	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/copyright	2017-10-24 07:47:45.000000000 +0000
@@ -5,11 +5,12 @@
 
 Files: *
 Copyright: 2005 Jens Axboe <axboe@suse.de>
-           2006-2012 Jens Axboe <axboe@kernel.dk>
+           2006-2017 Jens Axboe <axboe@kernel.dk>
 License: GPL-2
 
 Files: debian/*
-Copyright: 2009-2012 Martin Steigerwald <ms@teamix.de>
+Copyright: 2009-2017 Martin Steigerwald <ms@teamix.de>
+           2017 Martin Steigerwald <martin.steigerwald@proact.de>
 Comment:
  This package was debianized by Martin Steigerwald <ms@teamix.de> on
  Tue, 19 May 2009 15:04:02 +0200.
@@ -64,7 +65,7 @@
 License: GPL-2+
 
 Files: crc/crc32c-intel.c
-Copyright: Based on a posting to lkml by Austin Zhang <austin.zhang@intel.com>
+Copyright: Based on a posting to LKML by Austin Zhang <austin.zhang@intel.com>
 License: GPL-2+
 
 Files: crc/md5.c
@@ -91,11 +92,27 @@
 Copyright: 2012-2014 Yann Collet
 License: BSD-2-clause
 
+Files: engines/dev-dax.c
+Copyright: 2016 Intel Corp
+License: GPL-2
+
 Files: engines/fusion-aw.c
 Copyright: 2013 Fusion-io, Inc.
            Santhosh Kumar Koundinya <skoundinya@fusionio.com>
 License: GPL-2
 
+Files: engines/pmemblk.c
+Copyright: 2016 Hewlett Packard Enterprise Development LP
+License: GPL-2
+
+Files: examples/gfapi.fio
+Copyright: Originally authored by Castor Fu
+License: GPL-2
+
+Files: examples/jesd219.fio
+Copyright: Based on a posting from Jeff Furlong <jeff.furlong@hgst.com>
+License: GPL-2
+
 Files: exp/expression-parser.l exp/expression-parser.y exp/test-expression-parser.c
 Copyright: 2014, Stephen M. Cameron
 License: GPL-2
@@ -142,7 +159,7 @@
 License: GPL-2+
 
 Files: oslib/libmtd_common.h
-Copyright: 2007,2008 Artem Bityutskiy
+Copyright: 2007, 2008 Artem Bityutskiy
 License: GPL-2+
 
 Files: oslib/libmtd_legacy.c
@@ -158,6 +175,10 @@
            2006 KaiGai Kohei <kaigai@ak.jp.nec.com>
 License: GPL-2+
 
+Files: t/read-to-pipe-async.c
+Copyright: 2016 Jens Axboe
+License: GPL-2+
+
 Files: tools/fio_generate_plots.1
 Copyright: Written by Martin Steigerwald <ms@teamix.de>
 License: GPL-2
@@ -167,6 +188,10 @@
            2016 Ben England <bengland@redhat.com>
 License: GPL-2
 
+Files: tools/hist/*
+Copyright: Karl Cronburg
+License: GPL-2
+
 Files: tools/genfio tools/plot/fio2gnuplot tools/plot/fio2gnuplot.1 tools/plot/fio2gnuplot.manpage tools/plot/graph2D.gpm tools/plot/graph3D.gpm tools/plot/math.gpm
 Copyright: 2013 eNovance SAS <licensing@enovance.com>
            Erwan Velu <erwan@enovance.com>
diff -Nru fio-2.16/debian/docs fio-3.1/debian/docs
--- fio-2.16/debian/docs	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/docs	2017-10-24 07:47:45.000000000 +0000
@@ -3,4 +3,9 @@
 MORAL-LICENSE
 REPORTING-BUGS
 examples/
+tools/hist/fiologparser_hist.py
+tools/hist/fiologparser_hist.py.1
+tools/hist/half-bins.py
+tools/fio_latency2csv.py
+tools/fio.service
 tools/fiologparser.py
diff -Nru fio-2.16/debian/patches/configure-no-configlog fio-3.1/debian/patches/configure-no-configlog
--- fio-2.16/debian/patches/configure-no-configlog	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/patches/configure-no-configlog	2017-10-24 07:47:45.000000000 +0000
@@ -1,9 +1,9 @@
 Description: Remove config.log to fix dpkg-source error about changed files.
-Author: Martin Steigerwald <ms@teamix.de>
+Author: Martin Steigerwald <martin.steigerwald@proact.de>
 
 --- a/configure
 +++ b/configure
-@@ -1973,3 +1973,5 @@
+@@ -2276,3 +2276,5 @@
  include \$(SRCDIR)/Makefile
  EOF
  fi
diff -Nru fio-2.16/debian/patches/fio2gnuplot-manpage fio-3.1/debian/patches/fio2gnuplot-manpage
--- fio-2.16/debian/patches/fio2gnuplot-manpage	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/patches/fio2gnuplot-manpage	2017-10-24 07:47:45.000000000 +0000
@@ -1,11 +1,11 @@
 Description: Fix tag lintian manpage-section-mismatch.
-Author: Martin Steigerwald <ms@teamix.de>
+Author: Martin Steigerwald <martin.steigerwald@proact.de>
 
 --- a/tools/plot/fio2gnuplot.1
 +++ b/tools/plot/fio2gnuplot.1
 @@ -1,5 +1,5 @@
  .\" Text automatically generated by txt2man
--.TH fio2gnuplot  "07 août 2013" "" ""
+-.TH fio2gnuplot 1 "August 2013"
 +.TH fio2gnuplot 1 "07 August 2013" "User Manual"
  .SH NAME
  \fBfio2gnuplot \fP- Render fio's output files with gnuplot
diff -Nru fio-2.16/debian/patches/fix-ftbfs-with-libmtd.h fio-3.1/debian/patches/fix-ftbfs-with-libmtd.h
--- fio-2.16/debian/patches/fix-ftbfs-with-libmtd.h	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/patches/fix-ftbfs-with-libmtd.h	2017-10-24 07:47:45.000000000 +0000
@@ -1,5 +1,5 @@
 Description: fix FTBFS libmtd.h:288:8: error: unknown type name 'uint8_t' (Debian Bug 815735)
-Author: Martin Steigerwald <martin.steigerwald@teamix.de>
+Author: Martin Steigerwald <martin.steigerwald@proact.de>
 
 --- a/oslib/libmtd.h
 +++ b/oslib/libmtd.h
diff -Nru fio-2.16/debian/patches/makefile-hardening fio-3.1/debian/patches/makefile-hardening
--- fio-2.16/debian/patches/makefile-hardening	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/patches/makefile-hardening	2017-10-24 07:47:45.000000000 +0000
@@ -1,5 +1,5 @@
 Description: Keep hardening build flags.
-Author: Martin Steigerwald <martin.steigerwald@teamix.de>
+Author: Martin Steigerwald <martin.steigerwald@proact.de>
 
 --- a/Makefile
 +++ b/Makefile
diff -Nru fio-2.16/debian/patches/makefile-manpagepath fio-3.1/debian/patches/makefile-manpagepath
--- fio-2.16/debian/patches/makefile-manpagepath	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/patches/makefile-manpagepath	2017-10-24 07:47:45.000000000 +0000
@@ -1,9 +1,9 @@
 Description: Adapt manpage path to Debian.
-Author: Martin Steigerwald <ms@teamix.de>
+Author: Martin Steigerwald <martin.steigerwald@proact.de>
 
 --- a/Makefile
 +++ b/Makefile
-@@ -298,7 +298,7 @@
+@@ -306,7 +306,7 @@
  mandir = /usr/share/man
  sharedir = /usr/share/fio
  else
diff -Nru fio-2.16/debian/patches/reproducible-build fio-3.1/debian/patches/reproducible-build
--- fio-2.16/debian/patches/reproducible-build	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/patches/reproducible-build	2017-10-24 07:47:45.000000000 +0000
@@ -3,7 +3,7 @@
 
 --- a/Makefile
 +++ b/Makefile
-@@ -185,7 +185,7 @@
+@@ -188,7 +188,7 @@
    CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format -static
  endif
  
diff -Nru fio-2.16/debian/patches/series fio-3.1/debian/patches/series
--- fio-2.16/debian/patches/series	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/patches/series	2017-10-24 07:47:45.000000000 +0000
@@ -2,5 +2,4 @@
 fio2gnuplot-manpage
 configure-no-configlog
 fix-ftbfs-with-libmtd.h
-spelling-errors
 reproducible-build
diff -Nru fio-2.16/debian/patches/spelling-errors fio-3.1/debian/patches/spelling-errors
--- fio-2.16/debian/patches/spelling-errors	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/patches/spelling-errors	1970-01-01 00:00:00.000000000 +0000
@@ -1,79 +0,0 @@
-Description: Fix some spelling errors in fio binary, fio manpage and HOWTO.
-Author: Martin Steigerwald <martin.steigerwald@teamix.de>
-
---- a/HOWTO
-+++ b/HOWTO
-@@ -685,13 +685,13 @@
- 				the next. Multiple files can still be
- 				open depending on 'openfiles'.
- 
--			zipf	Use a zipfian distribution to decide what file
-+			zipf	Use a Zipfian distribution to decide what file
- 				to access.
- 
--			pareto	Use a pareto distribution to decide what file
-+			pareto	Use a Pareto distribution to decide what file
- 				to access.
- 
--			gauss	Use a gaussian (normal) distribution to decide
-+			gauss	Use a Gaussian (normal) distribution to decide
- 				what file to access.
- 
- 		For random, roundrobin, and sequential, a postfix can be
-@@ -998,7 +998,7 @@
- 		random		Uniform random distribution
- 		zipf		Zipf distribution
- 		pareto		Pareto distribution
--		gauss		Normal (gaussian) distribution
-+		gauss		Normal (Gaussian) distribution
- 		zoned		Zoned random distribution
- 
- 		When using a zipf or pareto distribution, an input value
-@@ -1696,7 +1696,7 @@
- 
- log_hist_msec=int Same as log_avg_msec, but logs entries for completion
- 		latency histograms. Computing latency percentiles from averages of
--		intervals using log_avg_msec is innacurate. Setting this option makes
-+		intervals using log_avg_msec is inacurate. Setting this option makes
- 		fio log histogram entries over the specified period of time, reducing
- 		log sizes for high IOPS devices while retaining percentile accuracy.
- 		See log_hist_coarseness as well. Defaults to 0, meaning histogram
---- a/fio.1
-+++ b/fio.1
-@@ -592,13 +592,13 @@
- Do each file in the set sequentially.
- .TP
- .B zipf
--Use a zipfian distribution to decide what file to access.
-+Use a Zipfian distribution to decide what file to access.
- .TP
- .B pareto
--Use a pareto distribution to decide what file to access.
-+Use a Pareto distribution to decide what file to access.
- .TP
- .B gauss
--Use a gaussian (normal) distribution to decide what file to access.
-+Use a Gaussian (normal) distribution to decide what file to access.
- .RE
- .P
- For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be
-@@ -1575,7 +1575,7 @@
- .BI log_hist_msec \fR=\fPint
- Same as \fBlog_avg_msec\fR, but logs entries for completion latency histograms.
- Computing latency percentiles from averages of intervals using \fBlog_avg_msec\fR
--is innacurate. Setting this option makes fio log histogram entries over the
-+is inacurate. Setting this option makes fio log histogram entries over the
- specified period of time, reducing log sizes for high IOPS devices while
- retaining percentile accuracy. See \fBlog_hist_coarseness\fR as well. Defaults
- to 0, meaning histogram logging is disabled.
---- a/options.c
-+++ b/options.c
-@@ -2234,7 +2234,7 @@
- 			  },
- 			  { .ival = "gauss",
- 			    .oval = FIO_FSERVICE_GAUSS,
--			    .help = "Normal (gaussian) distribution",
-+			    .help = "Normal (Gaussian) distribution",
- 			  },
- 			  { .ival = "roundrobin",
- 			    .oval = FIO_FSERVICE_RR,
diff -Nru fio-2.16/debian/watch fio-3.1/debian/watch
--- fio-2.16/debian/watch	2016-12-23 16:09:43.000000000 +0000
+++ fio-3.1/debian/watch	2017-10-24 07:47:45.000000000 +0000
@@ -1,3 +1,3 @@
 version=4
-opts="pgpmode=next" http://brick.kernel.dk/snaps/fio-(\d.*)\.tar\.gz
-opts="pgpmode=previous" http://brick.kernel.dk/snaps/fio-(\d.*)\.tar\.gz\.asc
+opts=pgpmode=mangle
+opts=pgpsigurlmangle=s/$/.asc/ http://brick.kernel.dk/snaps/fio-(\d.*)\.tar\.gz
diff -Nru fio-2.16/diskutil.c fio-3.1/diskutil.c
--- fio-2.16/diskutil.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/diskutil.c	2017-09-28 10:23:20.000000000 +0000
@@ -3,6 +3,7 @@
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/sysmacros.h>
 #include <dirent.h>
 #include <libgen.h>
 #include <math.h>
@@ -18,8 +19,6 @@
 
 static struct fio_mutex *disk_util_mutex;
 
-FLIST_HEAD(disk_list);
-
 static struct disk_util *__init_per_file_disk_util(struct thread_data *td,
 		int majdev, int mindev, char *path);
 
@@ -37,6 +36,7 @@
 	}
 
 	fio_mutex_remove(du->lock);
+	free(du->sysfs_root);
 	sfree(du);
 }
 
@@ -85,7 +85,7 @@
 static void update_io_tick_disk(struct disk_util *du)
 {
 	struct disk_util_stat __dus, *dus, *ldus;
-	struct timeval t;
+	struct timespec t;
 
 	if (!du->users)
 		return;
@@ -305,7 +305,7 @@
 		return NULL;
 	}
 	strncpy((char *) du->dus.name, basename(path), FIO_DU_NAME_SZ - 1);
-	du->sysfs_root = path;
+	du->sysfs_root = strdup(path);
 	du->major = majdev;
 	du->minor = mindev;
 	INIT_FLIST_HEAD(&du->slavelist);
@@ -364,7 +364,7 @@
 		return 0;
 
 	while ((dir = readdir(D)) != NULL) {
-		char full_path[256];
+		char full_path[257];
 
 		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
 			continue;
@@ -430,9 +430,6 @@
 		sprintf(path, "%s", tmp);
 	}
 
-	if (td->o.ioscheduler && !td->sysfs_root)
-		td->sysfs_root = strdup(path);
-
 	return disk_util_add(td, majdev, mindev, path);
 }
 
@@ -451,12 +448,8 @@
 			mindev);
 
 	du = disk_util_exists(majdev, mindev);
-	if (du) {
-		if (td->o.ioscheduler && !td->sysfs_root)
-			td->sysfs_root = strdup(du->sysfs_root);
-
+	if (du)
 		return du;
-	}
 
 	/*
 	 * for an fs without a device, we will repeatedly stat through
diff -Nru fio-2.16/diskutil.h fio-3.1/diskutil.h
--- fio-2.16/diskutil.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/diskutil.h	2017-09-28 10:23:20.000000000 +0000
@@ -46,7 +46,6 @@
 	 */
 	struct flist_head slavelist;
 
-	char *name;
 	char *sysfs_root;
 	char path[PATH_MAX];
 	int major, minor;
@@ -65,7 +64,7 @@
 	 */
 	struct flist_head slaves;
 
-	struct timeval time;
+	struct timespec time;
 
 	struct fio_mutex *lock;
 	unsigned long users;
@@ -115,6 +114,7 @@
 extern void setup_disk_util(void);
 extern void disk_util_prune_entries(void);
 #else
+/* keep this as a function to avoid a warning in handle_du() */
 static inline void print_disk_util(struct disk_util_stat *du,
 				   struct disk_util_agg *agg, int terse,
 				   struct buf_output *out)
diff -Nru fio-2.16/doc/conf.py fio-3.1/doc/conf.py
--- fio-2.16/doc/conf.py	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/doc/conf.py	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# fio documentation build configuration file, created by
+# sphinx-quickstart on Mon Nov 14 13:56:30 2016.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = []
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffixes as a list of strings:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'fio'
+copyright = '2017, Jens Axboe <axboe@kernel.dk>'
+author = 'Jens Axboe <axboe@kernel.dk>'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+
+# The short X.Y version.
+# version = '1'
+# The full version, including alpha/beta/rc tags.
+# release = '1'
+
+def fio_version():
+
+	from os.path import exists, dirname, join
+	wsroot = dirname(dirname(__file__))
+	version_file = join(wsroot, "FIO-VERSION-FILE")
+	if not exists(version_file):
+		version_gen = join(wsroot, "FIO-VERSION-GEN")
+		from subprocess import call
+		rc = call(version_gen, shell=True, cwd=wsroot)
+		if rc:
+			print("Couldn't generate version file. rc=%r" % rc)
+			return "Unknown", "Unknown"
+
+	vsl = open(version_file).read().strip().split('-')
+	version = vsl[1]
+	release = '-'.join(vsl[1:])
+	return version, release
+
+version, release = fio_version()
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path.
+exclude_patterns = ['output', 'Thumbs.db', '.DS_Store', 'fio_examples.rst']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = 'fio v1'
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+# html_logo = None
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# 'zh' user can custom change `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'fiodoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+     # The paper size ('letterpaper' or 'a4paper').
+     #
+     # 'papersize': 'letterpaper',
+
+     # The font size ('10pt', '11pt' or '12pt').
+     #
+     # 'pointsize': '10pt',
+
+     # Additional stuff for the LaTeX preamble.
+     #
+     # 'preamble': '',
+
+     # Latex figure (float) alignment
+     #
+     # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'fio.tex', 'fio Documentation',
+     'a', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, will not define \strong, \code, \titleref, \crossref ... but only
+# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added
+# packages.
+#
+# latex_keep_old_macro_names = True
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('fio_man', 'fio', 'flexible I/O tester',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'fio', 'fio Documentation',
+     author, 'fio', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
diff -Nru fio-2.16/doc/fio_doc.rst fio-3.1/doc/fio_doc.rst
--- fio-2.16/doc/fio_doc.rst	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/doc/fio_doc.rst	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,51 @@
+fio - Flexible I/O tester rev. |version|
+========================================
+
+
+.. include:: ../README
+
+
+.. include:: ../HOWTO
+
+
+
+Examples
+========
+
+.. include:: fio_examples.rst
+
+
+
+TODO
+====
+
+
+GFIO TODO
+---------
+
+.. include:: ../GFIO-TODO
+
+
+Server TODO
+-----------
+
+.. include:: ../SERVER-TODO
+
+
+Steady State TODO
+-----------------
+
+.. include:: ../STEADYSTATE-TODO
+
+
+
+Moral License
+=============
+
+.. include:: ../MORAL-LICENSE
+
+
+License
+=======
+
+.. literalinclude:: ../COPYING
diff -Nru fio-2.16/doc/fio_examples.rst fio-3.1/doc/fio_examples.rst
--- fio-2.16/doc/fio_examples.rst	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/doc/fio_examples.rst	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,72 @@
+Some job file examples.
+
+
+Poisson request flow
+--------------------
+
+.. only:: builder_html
+
+:download:`Download poisson-rate-submission.fio <../examples/poisson-rate-submission.fio>`
+
+.. literalinclude:: ../examples/poisson-rate-submission.fio
+	:language: ini
+
+Latency profile
+---------------
+
+.. only:: builder_html
+
+:download:`Download latency-profile.fio <../examples/latency-profile.fio>`
+
+.. literalinclude:: ../examples/latency-profile.fio
+	:language: ini
+
+Read 4 files with aio at different depths
+-----------------------------------------
+
+.. only:: builder_html
+
+:download:`Download aio-read.fio <../examples/aio-read.fio>`
+
+.. literalinclude:: ../examples/aio-read.fio
+	:language: ini
+
+Read backwards in a file
+------------------------
+
+.. only:: builder_html
+
+:download:`Download backwards-read.fio <../examples/backwards-read.fio>`
+
+.. literalinclude:: ../examples/backwards-read.fio
+	:language: ini
+
+Basic verification
+------------------
+
+.. only:: builder_html
+
+:download:`Download basic-verify.fio <../examples/basic-verify.fio>`
+
+.. literalinclude:: ../examples/basic-verify.fio
+	:language: ini
+
+Fixed rate submission
+---------------------
+
+.. only:: builder_html
+
+:download:`Download fixed-rate-submission.fio <../examples/fixed-rate-submission.fio>`
+
+.. literalinclude:: ../examples/fixed-rate-submission.fio
+	:language: ini
+
+Butterfly seek pattern
+-----------------------
+
+.. only:: builder_html
+
+:download:`Download butterfly.fio <../examples/butterfly.fio>`
+
+.. literalinclude:: ../examples/butterfly.fio
+	:language: ini
diff -Nru fio-2.16/doc/fio_man.rst fio-3.1/doc/fio_man.rst
--- fio-2.16/doc/fio_man.rst	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/doc/fio_man.rst	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,12 @@
+:orphan:
+
+Fio Manpage
+===========
+
+(rev. |release|)
+
+
+.. include:: ../README
+
+
+.. include:: ../HOWTO
diff -Nru fio-2.16/doc/index.rst fio-3.1/doc/index.rst
--- fio-2.16/doc/index.rst	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/doc/index.rst	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,25 @@
+.. FIO documentation master file, created by
+   sphinx-quickstart on Thu Mar 20 16:24:25 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to FIO's documentation!
+===============================
+
+**Version:** |release|
+
+Contents:
+
+.. toctree::
+   :maxdepth: 3
+   :numbered:
+
+	fio - Flexible I/O tester |version| <fio_doc>
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
+
diff -Nru fio-2.16/doc/make.bat fio-3.1/doc/make.bat
--- fio-2.16/doc/make.bat	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/doc/make.bat	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,281 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html       to make standalone HTML files
+	echo.  dirhtml    to make HTML files named index.html in directories
+	echo.  singlehtml to make a single large HTML file
+	echo.  pickle     to make pickle files
+	echo.  json       to make JSON files
+	echo.  htmlhelp   to make HTML files and a HTML help project
+	echo.  qthelp     to make HTML files and a qthelp project
+	echo.  devhelp    to make HTML files and a Devhelp project
+	echo.  epub       to make an epub
+	echo.  epub3      to make an epub3
+	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  text       to make text files
+	echo.  man        to make manual pages
+	echo.  texinfo    to make Texinfo files
+	echo.  gettext    to make PO message catalogs
+	echo.  changes    to make an overview over all changed/added/deprecated items
+	echo.  xml        to make Docutils-native XML files
+	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
+	echo.  linkcheck  to check all external links for integrity
+	echo.  doctest    to run all doctests embedded in the documentation if enabled
+	echo.  coverage   to run coverage check of the documentation if enabled
+	echo.  dummy      to check syntax errors of document sources
+	goto end
+)
+
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+
+REM Check if sphinx-build is available and fallback to Python version if any
+%SPHINXBUILD% 1>NUL 2>NUL
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+:sphinx_ok
+
+
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+
+if "%1" == "singlehtml" (
+	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+	goto end
+)
+
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\fio.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\fio.ghc
+	goto end
+)
+
+if "%1" == "devhelp" (
+	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished.
+	goto end
+)
+
+if "%1" == "epub" (
+	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub file is in %BUILDDIR%/epub.
+	goto end
+)
+
+if "%1" == "epub3" (
+	%SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
+	goto end
+)
+
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdf" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdfja" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf-ja
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "text" (
+	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The text files are in %BUILDDIR%/text.
+	goto end
+)
+
+if "%1" == "man" (
+	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The manual pages are in %BUILDDIR%/man.
+	goto end
+)
+
+if "%1" == "texinfo" (
+	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+	goto end
+)
+
+if "%1" == "gettext" (
+	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+	goto end
+)
+
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+
+if "%1" == "coverage" (
+	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of coverage in the sources finished, look at the ^
+results in %BUILDDIR%/coverage/python.txt.
+	goto end
+)
+
+if "%1" == "xml" (
+	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The XML files are in %BUILDDIR%/xml.
+	goto end
+)
+
+if "%1" == "pseudoxml" (
+	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+	goto end
+)
+
+if "%1" == "dummy" (
+	%SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. Dummy builder generates no files.
+	goto end
+)
+
+:end
diff -Nru fio-2.16/doc/Makefile fio-3.1/doc/Makefile
--- fio-2.16/doc/Makefile	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/doc/Makefile	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,225 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = output
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  epub3      to make an epub3"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+	@echo "  dummy      to check syntax errors of document sources"
+
+.PHONY: clean
+clean:
+	test -n "$(BUILDDIR)" && rm -rf $(BUILDDIR)/*
+
+.PHONY: html
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+.PHONY: dirhtml
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+.PHONY: singlehtml
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+.PHONY: pickle
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+.PHONY: json
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+.PHONY: htmlhelp
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+.PHONY: qthelp
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fio.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fio.qhc"
+
+.PHONY: applehelp
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+.PHONY: devhelp
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/fio"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fio"
+	@echo "# devhelp"
+
+.PHONY: epub
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+.PHONY: epub3
+epub3:
+	$(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
+	@echo
+	@echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
+
+.PHONY: latex
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+.PHONY: latexpdf
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: latexpdfja
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: text
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+.PHONY: man
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+.PHONY: texinfo
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+.PHONY: info
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+.PHONY: gettext
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+.PHONY: changes
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+.PHONY: linkcheck
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output" \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+.PHONY: doctest
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the" \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+.PHONY: coverage
+coverage:
+	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+	@echo "Testing of coverage in the sources finished, look at the" \
+	      "results in $(BUILDDIR)/coverage/python.txt."
+
+.PHONY: xml
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+.PHONY: pseudoxml
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+
+.PHONY: dummy
+dummy:
+	$(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
+	@echo
+	@echo "Build finished. Dummy builder generates no files."
diff -Nru fio-2.16/engines/binject.c fio-3.1/engines/binject.c
--- fio-2.16/engines/binject.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/binject.c	2017-09-28 10:23:20.000000000 +0000
@@ -59,11 +59,12 @@
 	return 0;
 }
 
-static unsigned int binject_read_commands(struct thread_data *td, void *p,
+static unsigned int binject_read_commands(struct thread_data *td, void *buf,
 					  int left, int *err)
 {
 	struct fio_file *f;
 	int i, ret, events;
+	char *p = buf;
 
 one_more:
 	events = 0;
@@ -351,7 +352,7 @@
 	if (ret)
 		return 1;
 
-	if (f->filetype != FIO_TYPE_BD) {
+	if (f->filetype != FIO_TYPE_BLOCK) {
 		log_err("fio: binject only works with block devices\n");
 		goto err_close;
 	}
diff -Nru fio-2.16/engines/cpu.c fio-3.1/engines/cpu.c
--- fio-2.16/engines/cpu.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/cpu.c	2017-09-28 10:23:20.000000000 +0000
@@ -22,7 +22,7 @@
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct cpu_options, cpuload),
 		.help	= "Use this percentage of CPU",
-		.category = FIO_OPT_C_GENERAL,
+		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
@@ -34,7 +34,7 @@
 		.def	= "50000",
 		.parent = "cpuload",
 		.hide	= 1,
-		.category = FIO_OPT_C_GENERAL,
+		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
@@ -44,7 +44,7 @@
 		.off1	= offsetof(struct cpu_options, exit_io_done),
 		.help	= "Exit when IO threads finish",
 		.def	= "0",
-		.category = FIO_OPT_C_GENERAL,
+		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
diff -Nru fio-2.16/engines/dev-dax.c fio-3.1/engines/dev-dax.c
--- fio-2.16/engines/dev-dax.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/dev-dax.c	2017-09-28 10:23:20.000000000 +0000
@@ -51,14 +51,14 @@
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/sysmacros.h>
-#include <dlfcn.h>
 #include <libgen.h>
+#include <libpmem.h>
 
 #include "../fio.h"
 #include "../verify.h"
 
 /*
- * Limits us to 1GB of mapped files in total to model after
+ * Limits us to 1GiB of mapped files in total to model after
  * mmap engine behavior
  */
 #define MMAP_TOTAL_SZ	(1 * 1024 * 1024 * 1024UL)
@@ -69,8 +69,6 @@
 	off_t devdax_off;
 };
 
-static void * (*pmem_memcpy_persist)(void *dest, const void *src, size_t len);
-
 static int fio_devdax_file(struct thread_data *td, struct fio_file *f,
 			   size_t length, off_t off)
 {
@@ -108,7 +106,7 @@
 	struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
 
 	if (io_u->buflen > f->real_file_size) {
-		log_err("fio: bs too big for dev-dax engine\n");
+		log_err("dev-dax: bs too big for dev-dax engine\n");
 		return EIO;
 	}
 
@@ -212,29 +210,11 @@
 static int fio_devdax_init(struct thread_data *td)
 {
 	struct thread_options *o = &td->o;
-	const char *path;
-	void *dl;
 
 	if ((o->rw_min_bs & page_mask) &&
 	    (o->fsync_blocks || o->fdatasync_blocks)) {
-		log_err("fio: mmap options dictate a minimum block size of "
-			"%llu bytes\n", (unsigned long long) page_size);
-		return 1;
-	}
-
-	path = getenv("FIO_PMEM_LIB");
-	if (!path)
-		path = "libpmem.so";
-
-	dl = dlopen(path, RTLD_NOW | RTLD_NODELETE);
-	if (!dl) {
-		log_err("fio: unable to open libpmem: %s\n", dlerror());
-		return 1;
-	}
-
-	pmem_memcpy_persist = dlsym(dl, "pmem_memcpy_persist");
-	if (!pmem_memcpy_persist) {
-		log_err("fio: unable to load libpmem: %s\n", dlerror());
+		log_err("dev-dax: mmap options dictate a minimum block size of %llu bytes\n",
+			(unsigned long long) page_size);
 		return 1;
 	}
 
@@ -292,8 +272,8 @@
 
 	rc = stat(f->file_name, &st);
 	if (rc < 0) {
-		log_err("%s: failed to stat file %s: %d\n",
-			td->o.name, f->file_name, errno);
+		log_err("%s: failed to stat file %s (%s)\n",
+			td->o.name, f->file_name, strerror(errno));
 		return -errno;
 	}
 
@@ -302,8 +282,8 @@
 
 	rpath = realpath(spath, npath);
 	if (!rpath) {
-		log_err("%s: realpath on %s failed: %d\n",
-			td->o.name, spath, errno);
+		log_err("%s: realpath on %s failed (%s)\n",
+			td->o.name, spath, strerror(errno));
 		return -errno;
 	}
 
@@ -318,15 +298,15 @@
 
 	sfile = fopen(spath, "r");
 	if (!sfile) {
-		log_err("%s: fopen on %s failed: %d\n",
-			td->o.name, spath, errno);
+		log_err("%s: fopen on %s failed (%s)\n",
+			td->o.name, spath, strerror(errno));
 		return 1;
 	}
 
 	rc = fscanf(sfile, "%lu", &size);
 	if (rc < 0) {
-		log_err("%s: fscanf on %s failed: %d\n",
-			td->o.name, spath, errno);
+		log_err("%s: fscanf on %s failed (%s)\n",
+			td->o.name, spath, strerror(errno));
 		return 1;
 	}
 
diff -Nru fio-2.16/engines/e4defrag.c fio-3.1/engines/e4defrag.c
--- fio-2.16/engines/e4defrag.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/e4defrag.c	2017-09-28 10:23:20.000000000 +0000
@@ -95,7 +95,7 @@
 	ed->donor_fd = open(donor_name, O_CREAT|O_WRONLY, 0644);
 	if (ed->donor_fd < 0) {
 		td_verror(td, errno, "io_queue_init");
-		log_err("Can't open donor file %s err:%d", donor_name, ed->donor_fd);
+		log_err("Can't open donor file %s err:%d\n", donor_name, errno);
 		free(ed);
 		return 1;
 	}
@@ -172,8 +172,13 @@
 		len = io_u->xfer_buflen;
 
 	if (len != io_u->xfer_buflen) {
-		io_u->resid = io_u->xfer_buflen - len;
-		io_u->error = 0;
+		if (len) {
+			io_u->resid = io_u->xfer_buflen - len;
+			io_u->error = 0;
+		} else {
+			/* access beyond i_size */
+			io_u->error = EINVAL;
+		}
 	}
 	if (ret)
 		io_u->error = errno;
diff -Nru fio-2.16/engines/ftruncate.c fio-3.1/engines/ftruncate.c
--- fio-2.16/engines/ftruncate.c	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/engines/ftruncate.c	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,56 @@
+/*
+ * ftruncate: ioengine for git://git.kernel.dk/fio.git
+ *
+ * IO engine that does regular truncates to simulate data transfer
+ * as fio ioengine.
+ * DDIR_WRITE does ftruncate
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+
+#include "../fio.h"
+#include "../filehash.h"
+
+/*
+ * Queue handler: a write request truncates the file to the request
+ * offset; every other data direction is rejected with EINVAL.
+ */
+static int fio_ftruncate_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+
+	fio_ro_check(td, io_u);
+
+	if (io_u->ddir != DDIR_WRITE)
+		io_u->error = EINVAL;
+	else if (ftruncate(f->fd, io_u->offset))
+		io_u->error = errno;
+
+	return FIO_Q_COMPLETED;
+}
+
+static struct ioengine_ops ioengine = {
+	.name		= "ftruncate",
+	.version	= FIO_IOOPS_VERSION,
+	.queue		= fio_ftruncate_queue,
+	.open_file	= generic_open_file,
+	.close_file	= generic_close_file,
+	.get_file_size	= generic_get_file_size,
+	.flags		= FIO_SYNCIO | FIO_FAKEIO
+};
+
+static void fio_init fio_ftruncate_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_ftruncate_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff -Nru fio-2.16/engines/glusterfs_async.c fio-3.1/engines/glusterfs_async.c
--- fio-2.16/engines/glusterfs_async.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/glusterfs_async.c	2017-09-28 10:23:20.000000000 +0000
@@ -92,7 +92,7 @@
 	struct io_u *io_u = data;
 	struct fio_gf_iou *iou = io_u->engine_data;
 
-	dprint(FD_IO, "%s ret %lu\n", __FUNCTION__, ret);
+	dprint(FD_IO, "%s ret %zd\n", __FUNCTION__, ret);
 	iou->io_complete = 1;
 }
 
diff -Nru fio-2.16/engines/glusterfs.c fio-3.1/engines/glusterfs.c
--- fio-2.16/engines/glusterfs.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/glusterfs.c	2017-09-28 10:23:20.000000000 +0000
@@ -165,11 +165,11 @@
 	if (td_read(td)) {
 		if (glfs_lstat(g->fs, f->file_name, &sb)
 		    || sb.st_size < f->real_file_size) {
-			dprint(FD_FILE, "fio extend file %s from %ld to %ld\n",
-			       f->file_name, sb.st_size, f->real_file_size);
+			dprint(FD_FILE, "fio extend file %s from %jd to %" PRIu64 "\n",
+			       f->file_name, (intmax_t) sb.st_size, f->real_file_size);
 			ret = glfs_ftruncate(g->fd, f->real_file_size);
 			if (ret) {
-				log_err("failed fio extend file %s to %ld\n",
+				log_err("failed fio extend file %s to %" PRIu64 "\n",
 					f->file_name, f->real_file_size);
 			} else {
 				unsigned long long left;
@@ -190,7 +190,7 @@
 
 					r = glfs_write(g->fd, b, bs, 0);
 					dprint(FD_IO,
-					       "fio write %d of %ld file %s\n",
+					       "fio write %d of %" PRIu64 " file %s\n",
 					       r, f->real_file_size,
 					       f->file_name);
 
diff -Nru fio-2.16/engines/glusterfs_sync.c fio-3.1/engines/glusterfs_sync.c
--- fio-2.16/engines/glusterfs_sync.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/glusterfs_sync.c	2017-09-28 10:23:20.000000000 +0000
@@ -7,7 +7,7 @@
 
 #include "gfapi.h"
 
-#define LAST_POS(f)	((f)->engine_data)
+#define LAST_POS(f)	((f)->engine_pos)
 static int fio_gf_prep(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
diff -Nru fio-2.16/engines/guasi.c fio-3.1/engines/guasi.c
--- fio-2.16/engines/guasi.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/guasi.c	2017-09-28 10:23:20.000000000 +0000
@@ -132,7 +132,7 @@
 {
 	int i;
 	struct io_u *io_u;
-	struct timeval now;
+	struct timespec now;
 
 	if (!fio_fill_issue_time(td))
 		return;
diff -Nru fio-2.16/engines/libaio.c fio-3.1/engines/libaio.c
--- fio-2.16/engines/libaio.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/libaio.c	2017-09-28 10:23:20.000000000 +0000
@@ -220,7 +220,7 @@
 static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us,
 			      unsigned int nr)
 {
-	struct timeval now;
+	struct timespec now;
 	unsigned int i;
 
 	if (!fio_fill_issue_time(td))
@@ -241,7 +241,7 @@
 	struct libaio_data *ld = td->io_ops_data;
 	struct iocb **iocbs;
 	struct io_u **io_us;
-	struct timeval tv;
+	struct timespec ts;
 	int ret, wait_start = 0;
 
 	if (!ld->queued)
@@ -282,9 +282,9 @@
 				break;
 			}
 			if (!wait_start) {
-				fio_gettime(&tv, NULL);
+				fio_gettime(&ts, NULL);
 				wait_start = 1;
-			} else if (mtime_since_now(&tv) > 30000) {
+			} else if (mtime_since_now(&ts) > 30000) {
 				log_err("fio: aio appears to be stalled, giving up\n");
 				break;
 			}
diff -Nru fio-2.16/engines/mmap.c fio-3.1/engines/mmap.c
--- fio-2.16/engines/mmap.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/mmap.c	2017-09-28 10:23:20.000000000 +0000
@@ -15,7 +15,7 @@
 #include "../verify.h"
 
 /*
- * Limits us to 1GB of mapped files in total
+ * Limits us to 1GiB of mapped files in total
  */
 #define MMAP_TOTAL_SZ	(1 * 1024 * 1024 * 1024UL)
 
@@ -67,7 +67,7 @@
 	}
 
 #ifdef FIO_MADV_FREE
-	if (f->filetype == FIO_TYPE_BD)
+	if (f->filetype == FIO_TYPE_BLOCK)
 		(void) posix_madvise(fmd->mmap_ptr, fmd->mmap_sz, FIO_MADV_FREE);
 #endif
 
diff -Nru fio-2.16/engines/mtd.c fio-3.1/engines/mtd.c
--- fio-2.16/engines/mtd.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/mtd.c	2017-09-28 10:23:20.000000000 +0000
@@ -13,6 +13,7 @@
 #include <mtd/mtd-user.h>
 
 #include "../fio.h"
+#include "../optgroup.h"
 #include "../verify.h"
 #include "../oslib/libmtd.h"
 
@@ -22,6 +23,28 @@
 	struct mtd_dev_info info;
 };
 
+struct fio_mtd_options {
+	void *pad; /* avoid off1 == 0 */
+	unsigned int skip_bad;
+};
+
+static struct fio_option options[] = {
+	{
+		.name	= "skip_bad",
+		.lname	= "Skip operations against bad blocks",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct fio_mtd_options, skip_bad),
+		.help	= "Skip operations against known bad blocks.",
+		.hide	= 1,
+		.def	= "0",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_MTD,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
 static int fio_mtd_maybe_mark_bad(struct thread_data *td,
 				  struct fio_mtd_data *fmd,
 				  struct io_u *io_u, int eb)
@@ -55,6 +78,7 @@
 {
 	struct fio_file *f = io_u->file;
 	struct fio_mtd_data *fmd = FILE_ENG_DATA(f);
+	struct fio_mtd_options *o = td->eo;
 	int local_offs = 0;
 	int ret;
 
@@ -77,7 +101,7 @@
 			      (int)fmd->info.eb_size - eb_offs);
 		char *buf = ((char *)io_u->buf) + local_offs;
 
-		if (td->o.skip_bad) {
+		if (o->skip_bad) {
 			ret = fio_mtd_is_bad(td, fmd, io_u, eb);
 			if (ret == -1)
 				break;
@@ -190,6 +214,8 @@
 	.close_file	= fio_mtd_close_file,
 	.get_file_size	= fio_mtd_get_file_size,
 	.flags		= FIO_SYNCIO | FIO_NOEXTEND,
+	.options	= options,
+	.option_struct_size	= sizeof(struct fio_mtd_options),
 };
 
 static void fio_init fio_mtd_register(void)
diff -Nru fio-2.16/engines/net.c fio-3.1/engines/net.c
--- fio-2.16/engines/net.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/net.c	2017-09-28 10:23:20.000000000 +0000
@@ -1218,7 +1218,7 @@
 			return 1;
 		}
 		if (is_ipv6(o)) {
-			log_err("fio: IPv6 not supported for multicast network IO");
+			log_err("fio: IPv6 not supported for multicast network IO\n");
 			close(fd);
 			return 1;
 		}
@@ -1371,7 +1371,7 @@
 	}
 
 	if (!td->io_ops_data) {
-		nd = malloc(sizeof(*nd));;
+		nd = malloc(sizeof(*nd));
 
 		memset(nd, 0, sizeof(*nd));
 		nd->listenfd = -1;
diff -Nru fio-2.16/engines/null.c fio-3.1/engines/null.c
--- fio-2.16/engines/null.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/null.c	2017-09-28 10:23:20.000000000 +0000
@@ -135,23 +135,21 @@
 
 #ifdef FIO_EXTERNAL_ENGINE
 extern "C" {
+static struct ioengine_ops ioengine;
 void get_ioengine(struct ioengine_ops **ioengine_ptr)
 {
-	struct ioengine_ops *ioengine;
+	*ioengine_ptr = &ioengine;
 
-	*ioengine_ptr = (struct ioengine_ops *) malloc(sizeof(struct ioengine_ops));
-	ioengine = *ioengine_ptr;
-
-	strcpy(ioengine->name, "cpp_null");
-	ioengine->version        = FIO_IOOPS_VERSION;
-	ioengine->queue          = fio_null_queue;
-	ioengine->commit         = fio_null_commit;
-	ioengine->getevents      = fio_null_getevents;
-	ioengine->event          = fio_null_event;
-	ioengine->init           = fio_null_init;
-	ioengine->cleanup        = fio_null_cleanup;
-	ioengine->open_file      = fio_null_open;
-	ioengine->flags	         = FIO_DISKLESSIO | FIO_FAKEIO;
+	ioengine.name           = "cpp_null";
+	ioengine.version        = FIO_IOOPS_VERSION;
+	ioengine.queue          = fio_null_queue;
+	ioengine.commit         = fio_null_commit;
+	ioengine.getevents      = fio_null_getevents;
+	ioengine.event          = fio_null_event;
+	ioengine.init           = fio_null_init;
+	ioengine.cleanup        = fio_null_cleanup;
+	ioengine.open_file      = fio_null_open;
+	ioengine.flags          = FIO_DISKLESSIO | FIO_FAKEIO;
 }
 }
 #endif /* FIO_EXTERNAL_ENGINE */
diff -Nru fio-2.16/engines/pmemblk.c fio-3.1/engines/pmemblk.c
--- fio-2.16/engines/pmemblk.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/pmemblk.c	2017-09-28 10:23:20.000000000 +0000
@@ -27,11 +27,11 @@
  *   ioengine=pmemblk
  *
  * Other relevant settings:
+ *   thread=1   REQUIRED
  *   iodepth=1
  *   direct=1
- *   thread=1   REQUIRED
  *   unlink=1
- *   filename=/pmem0/fiotestfile,BSIZE,FSIZEMB
+ *   filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB
  *
  *   thread must be set to 1 for pmemblk as multiple processes cannot
  *     open the same block pool file.
@@ -39,23 +39,26 @@
  *   iodepth should be set to 1 as pmemblk is always synchronous.
  *   Use numjobs to scale up.
  *
- *   direct=1 is implied as pmemblk is always direct.
+ *   direct=1 is implied as pmemblk is always direct. A warning message
+ *   is printed if this is not specified.
+ *
+ *   unlink=1 removes the block pool file after testing, and is optional.
  *
- *   Can set unlink to 1 to remove the block pool file after testing.
+ *   The pmem device must have a DAX-capable filesystem and be mounted
+ *   with DAX enabled.  filename must point to a file on that filesystem.
+ *
+ *   Example:
+ *     mkfs.xfs /dev/pmem0
+ *     mkdir /mnt/pmem0
+ *     mount -o dax /dev/pmem0 /mnt/pmem0
  *
  *   When specifying the filename, if the block pool file does not already
- *   exist, then the pmemblk engine can create the pool file if you specify
+ *   exist, then the pmemblk engine creates the pool file if you specify
  *   the block and file sizes.  BSIZE is the block size in bytes.
- *   FSIZEMB is the pool file size in MB.
+ *   FSIZEMiB is the pool file size in MiB.
  *
  *   See examples/pmemblk.fio for more.
  *
- * libpmemblk.so
- *   By default, the pmemblk engine will let the system find the libpmemblk.so
- *   that it uses.  You can use an alternative libpmemblk by setting the
- *   FIO_PMEMBLK_LIB environment variable to the full path to the desired
- *   libpmemblk.so.
- *
  */
 
 #include <stdio.h>
@@ -64,68 +67,15 @@
 #include <sys/uio.h>
 #include <errno.h>
 #include <assert.h>
-#include <dlfcn.h>
 #include <string.h>
+#include <libpmem.h>
+#include <libpmemblk.h>
 
 #include "../fio.h"
 
 /*
  * libpmemblk
  */
-struct PMEMblkpool_s;
-typedef struct PMEMblkpool_s PMEMblkpool;
-
-static PMEMblkpool *(*pmemblk_create) (const char *, size_t, size_t, mode_t);
-static PMEMblkpool *(*pmemblk_open) (const char *, size_t);
-static void (*pmemblk_close) (PMEMblkpool *);
-static size_t(*pmemblk_nblock) (PMEMblkpool *);
-static size_t(*pmemblk_bsize) (PMEMblkpool *);
-static int (*pmemblk_read) (PMEMblkpool *, void *, off_t);
-static int (*pmemblk_write) (PMEMblkpool *, const void *, off_t);
-
-int load_libpmemblk(const char *path)
-{
-	void *dl;
-
-	if (!path)
-		path = "libpmemblk.so";
-
-	dl = dlopen(path, RTLD_NOW | RTLD_NODELETE);
-	if (!dl)
-		goto errorout;
-
-	pmemblk_create = dlsym(dl, "pmemblk_create");
-	if (!pmemblk_create)
-		goto errorout;
-	pmemblk_open = dlsym(dl, "pmemblk_open");
-	if (!pmemblk_open)
-		goto errorout;
-	pmemblk_close = dlsym(dl, "pmemblk_close");
-	if (!pmemblk_close)
-		goto errorout;
-	pmemblk_nblock = dlsym(dl, "pmemblk_nblock");
-	if (!pmemblk_nblock)
-		goto errorout;
-	pmemblk_bsize = dlsym(dl, "pmemblk_bsize");
-	if (!pmemblk_bsize)
-		goto errorout;
-	pmemblk_read = dlsym(dl, "pmemblk_read");
-	if (!pmemblk_read)
-		goto errorout;
-	pmemblk_write = dlsym(dl, "pmemblk_write");
-	if (!pmemblk_write)
-		goto errorout;
-
-	return 0;
-
-errorout:
-	log_err("fio: unable to load libpmemblk: %s\n", dlerror());
-	if (dl)
-		dlclose(dl);
-
-	return -1;
-}
-
 typedef struct fio_pmemblk_file *fio_pmemblk_file_t;
 
 struct fio_pmemblk_file {
@@ -136,10 +86,6 @@
 	size_t pmb_bsize;
 	size_t pmb_nblocks;
 };
-#define FIOFILEPMBSET(_f, _v)  do {                 \
-	(_f)->engine_data = (uint64_t)(uintptr_t)(_v);  \
-} while(0)
-#define FIOFILEPMBGET(_f)  ((fio_pmemblk_file_t)((_f)->engine_data))
 
 static fio_pmemblk_file_t Cache;
 
@@ -187,7 +133,7 @@
  * level, we allow the block size and file size to be appended
  * to the file name:
  *
- *   path[,bsize,fsizemb]
+ *   path[,bsize,fsizemib]
  *
  * note that we do not use the fio option "filesize" to dictate
  * the file size because we can only give libpmemblk the gross
@@ -197,7 +143,7 @@
  * the final path without the parameters is returned in ppath.
  * the block size and file size are returned in pbsize and fsize.
  *
- * note that the user should specify the file size in MiB, but
+ * note that the user specifies the file size in MiB, but
  * we return bytes from here.
  */
 static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize,
@@ -206,7 +152,7 @@
 	char *path;
 	char *s;
 	uint64_t bsize;
-	uint64_t fsizemb;
+	uint64_t fsizemib;
 
 	path = strdup(pathspec);
 	if (!path) {
@@ -216,14 +162,14 @@
 
 	/* extract sizes, if given */
 	s = strrchr(path, ',');
-	if (s && (fsizemb = strtoull(s + 1, NULL, 10))) {
+	if (s && (fsizemib = strtoull(s + 1, NULL, 10))) {
 		*s = 0;
 		s = strrchr(path, ',');
 		if (s && (bsize = strtoull(s + 1, NULL, 10))) {
 			*s = 0;
 			*ppath = path;
 			*pbsize = bsize;
-			*pfsize = fsizemb << 20;
+			*pfsize = fsizemib << 20;
 			return;
 		}
 	}
@@ -250,11 +196,6 @@
 
 	pmb = fio_pmemblk_cache_lookup(path);
 	if (!pmb) {
-		/* load libpmemblk if needed */
-		if (!pmemblk_open)
-			if (load_libpmemblk(getenv("FIO_PMEMBLK_LIB")))
-				goto error;
-
 		pmb = malloc(sizeof(*pmb));
 		if (!pmb)
 			goto error;
@@ -267,9 +208,8 @@
 			    pmemblk_create(path, bsize, fsize, 0644);
 		}
 		if (!pmb->pmb_pool) {
-			log_err
-			    ("fio: enable to open pmemblk pool file (errno %d)\n",
-			     errno);
+			log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n",
+			     path, strerror(errno));
 			goto error;
 		}
 
@@ -331,14 +271,14 @@
 	if (!td->o.use_thread) {
 		if (!thread_warned) {
 			thread_warned = 1;
-			log_err("fio: must set thread=1 for pmemblk engine\n");
+			log_err("pmemblk: must set thread=1 for pmemblk engine\n");
 		}
 		return 1;
 	}
 
 	if (!td->o.odirect && !odirect_warned) {
 		odirect_warned = 1;
-		log_info("fio: direct == 0, but pmemblk is always direct\n");
+		log_info("pmemblk: direct == 0, but pmemblk is always direct\n");
 	}
 
 	if (td->o.allow_create)
@@ -360,26 +300,26 @@
 	if (!pmb)
 		return 1;
 
-	FIOFILEPMBSET(f, pmb);
+	FILE_SET_ENG_DATA(f, pmb);
 	return 0;
 }
 
 static int fio_pmemblk_close_file(struct thread_data fio_unused *td,
 				  struct fio_file *f)
 {
-	fio_pmemblk_file_t pmb = FIOFILEPMBGET(f);
+	fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
 
 	if (pmb)
 		pmb_close(pmb, false);
 
-	FIOFILEPMBSET(f, NULL);
+	FILE_SET_ENG_DATA(f, NULL);
 	return 0;
 }
 
 static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f)
 {
 	uint64_t flags = 0;
-	fio_pmemblk_file_t pmb = FIOFILEPMBGET(f);
+	fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
 
 	if (fio_file_size_known(f))
 		return 0;
@@ -396,7 +336,7 @@
 
 	fio_file_set_size_known(f);
 
-	if (!FIOFILEPMBGET(f))
+	if (!FILE_ENG_DATA(f))
 		pmb_close(pmb, true);
 
 	return 0;
@@ -405,19 +345,16 @@
 static int fio_pmemblk_queue(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
-	fio_pmemblk_file_t pmb = FIOFILEPMBGET(f);
+	fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
 
 	unsigned long long off;
 	unsigned long len;
 	void *buf;
-	int (*blkop) (PMEMblkpool *, void *, off_t) = (void *)pmemblk_write;
 
 	fio_ro_check(td, io_u);
 
 	switch (io_u->ddir) {
 	case DDIR_READ:
-		blkop = pmemblk_read;
-		/* fall through */
 	case DDIR_WRITE:
 		off = io_u->offset;
 		len = io_u->xfer_buflen;
@@ -435,7 +372,11 @@
 		off /= pmb->pmb_bsize;
 		len /= pmb->pmb_bsize;
 		while (0 < len) {
-			if (0 != blkop(pmb->pmb_pool, buf, off)) {
+			/* reads must never fall through to pmemblk_write() */
+			int rc = io_u->ddir == DDIR_READ ?
+				 pmemblk_read(pmb->pmb_pool, buf, off) :
+				 pmemblk_write(pmb->pmb_pool, buf, off);
+			if (0 != rc) {
 				io_u->error = errno;
 				break;
 			}
@@ -482,7 +423,7 @@
 	return 0;
 }
 
-struct ioengine_ops ioengine = {
+static struct ioengine_ops ioengine = {
 	.name = "pmemblk",
 	.version = FIO_IOOPS_VERSION,
 	.queue = fio_pmemblk_queue,
diff -Nru fio-2.16/engines/rbd.c fio-3.1/engines/rbd.c
--- fio-2.16/engines/rbd.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/rbd.c	2017-09-28 10:23:20.000000000 +0000
@@ -36,6 +36,7 @@
 	struct io_u **aio_events;
 	struct io_u **sort_events;
 	int fd; /* add for poll */
+	bool connected;
 };
 
 struct rbd_options {
@@ -111,6 +112,8 @@
 	if (!rbd)
 		goto failed;
 
+	rbd->connected = false;
+
 	/* add for poll, init fd: -1 */
 	rbd->fd = -1;
 
@@ -287,7 +290,7 @@
 	 */
 	ret = rbd_aio_get_return_value(fri->completion);
 	if (ret < 0) {
-		io_u->error = ret;
+		io_u->error = -ret;
 		io_u->resid = io_u->xfer_buflen;
 	} else
 		io_u->error = 0;
@@ -514,6 +517,7 @@
 	} else {
 		dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__,
 		       io_u->ddir);
+		r = -EINVAL;
 		goto failed_comp;
 	}
 
@@ -521,7 +525,7 @@
 failed_comp:
 	rbd_aio_release(fri->completion);
 failed:
-	io_u->error = r;
+	io_u->error = -r;
 	td_verror(td, io_u->error, "xfer");
 	return FIO_Q_COMPLETED;
 }
@@ -529,6 +533,10 @@
 static int fio_rbd_init(struct thread_data *td)
 {
 	int r;
+	struct rbd_data *rbd = td->io_ops_data;
+
+	if (rbd->connected)
+		return 0;
 
 	r = _fio_rbd_connect(td);
 	if (r) {
@@ -559,13 +567,8 @@
 	rbd_image_info_t info;
 	struct fio_file *f;
 	struct rbd_data *rbd = NULL;
-	int major, minor, extra;
 	int r;
 
-	/* log version of librbd. No cluster connection required. */
-	rbd_version(&major, &minor, &extra);
-	log_info("rbd engine: RBD version: %d.%d.%d\n", major, minor, extra);
-
 	/* allocate engine specific structure to deal with librbd. */
 	r = _fio_setup_rbd_data(td, &rbd);
 	if (r) {
@@ -589,19 +592,20 @@
 		log_err("fio_rbd_connect failed.\n");
 		goto cleanup;
 	}
+	rbd->connected = true;
 
 	/* get size of the RADOS block device */
 	r = rbd_stat(rbd->image, &info, sizeof(info));
 	if (r < 0) {
 		log_err("rbd_status failed.\n");
-		goto disconnect;
+		goto cleanup;
 	} else if (info.size == 0) {
 		log_err("image size should be larger than zero.\n");
 		r = -EINVAL;
-		goto disconnect;
+		goto cleanup;
 	}
 
-	dprint(FD_IO, "rbd-engine: image size: %lu\n", info.size);
+	dprint(FD_IO, "rbd-engine: image size: %" PRIu64 "\n", info.size);
 
 	/* taken from "net" engine. Pretend we deal with files,
 	 * even if we do not have any ideas about files.
@@ -615,14 +619,8 @@
 	f = td->files[0];
 	f->real_file_size = info.size;
 
-	/* disconnect, then we were only connected to determine
-	 * the size of the RBD.
-	 */
-	_fio_rbd_disconnect(rbd);
 	return 0;
 
-disconnect:
-	_fio_rbd_disconnect(rbd);
 cleanup:
 	fio_rbd_cleanup(td);
 	return r;
diff -Nru fio-2.16/engines/rdma.c fio-3.1/engines/rdma.c
--- fio-2.16/engines/rdma.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/rdma.c	2017-09-28 10:23:20.000000000 +0000
@@ -44,7 +44,6 @@
 #include "../optgroup.h"
 
 #include <rdma/rdma_cma.h>
-#include <infiniband/arch.h>
 
 #define FIO_RDMA_MAX_IO_DEPTH    512
 
@@ -216,7 +215,7 @@
 		rd->rmt_nr = ntohl(rd->recv_buf.nr);
 
 		for (i = 0; i < rd->rmt_nr; i++) {
-			rd->rmt_us[i].buf = ntohll(rd->recv_buf.rmt_us[i].buf);
+			rd->rmt_us[i].buf = be64_to_cpu(rd->recv_buf.rmt_us[i].buf);
 			rd->rmt_us[i].rkey = ntohl(rd->recv_buf.rmt_us[i].rkey);
 			rd->rmt_us[i].size = ntohl(rd->recv_buf.rmt_us[i].size);
 
@@ -802,7 +801,7 @@
 			      unsigned int nr)
 {
 	struct rdmaio_data *rd = td->io_ops_data;
-	struct timeval now;
+	struct timespec now;
 	unsigned int i;
 
 	if (!fio_fill_issue_time(td))
@@ -881,7 +880,7 @@
 	rd->send_buf.nr = htonl(td->o.iodepth);
 
 	if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-		log_err("fio: ibv_post_send fail: %m");
+		log_err("fio: ibv_post_send fail: %m\n");
 		return 1;
 	}
 
@@ -932,7 +931,7 @@
 	ret = rdma_poll_wait(td, IBV_WC_RECV) < 0;
 
 	if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-		log_err("fio: ibv_post_send fail: %m");
+		log_err("fio: ibv_post_send fail: %m\n");
 		return 1;
 	}
 
@@ -965,7 +964,7 @@
 				     || (rd->rdma_protocol ==
 					 FIO_RDMA_MEM_READ))) {
 		if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-			log_err("fio: ibv_post_send fail: %m");
+			log_err("fio: ibv_post_send fail: %m\n");
 			return 1;
 		}
 
@@ -1300,7 +1299,7 @@
 		}
 
 		rd->send_buf.rmt_us[i].buf =
-		    htonll((uint64_t) (unsigned long)io_u->buf);
+		    cpu_to_be64((uint64_t) (unsigned long)io_u->buf);
 		rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey);
 		rd->send_buf.rmt_us[i].size = htonl(max_bs);
 
diff -Nru fio-2.16/engines/sg.c fio-3.1/engines/sg.c
--- fio-2.16/engines/sg.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/sg.c	2017-09-28 10:23:20.000000000 +0000
@@ -20,7 +20,7 @@
 #define MAX_SB 64               // sense block maximum return size
 
 struct sgio_cmd {
-	unsigned char cdb[16];  	// increase to support 16 byte commands
+	unsigned char cdb[16];      // enhanced from 10 to support 16 byte commands
 	unsigned char sb[MAX_SB];   // add sense block to commands
 	int nr;
 };
@@ -32,7 +32,6 @@
 	int *fd_flags;
 	void *sgbuf;
 	unsigned int bs;
-	long long max_lba;
 	int type_checked;
 };
 
@@ -125,7 +124,7 @@
 	}
 
 	while (left) {
-		void *p;
+		char *p;
 
 		dprint(FD_IO, "sgio_getevents: sd %p: left=%d\n", sd, left);
 
@@ -185,7 +184,7 @@
 			if (hdr->info & SG_INFO_CHECK) {
 				struct io_u *io_u;
 				io_u = (struct io_u *)(hdr->usr_ptr);
-				memcpy((void*)&(io_u->hdr), (void*)hdr, sizeof(struct sg_io_hdr));
+				memcpy(&io_u->hdr, hdr, sizeof(struct sg_io_hdr));
 				sd->events[i]->error = EIO;
 			}
 		}
@@ -253,7 +252,7 @@
 	struct fio_file *f = io_u->file;
 	int ret;
 
-	if (f->filetype == FIO_TYPE_BD) {
+	if (f->filetype == FIO_TYPE_BLOCK) {
 		ret = fio_sgio_ioctl_doio(td, f, io_u);
 		td->error = io_u->error;
 	} else {
@@ -309,7 +308,6 @@
 	 * blocks on medium.
 	 */
 	if (hdr->dxfer_direction != SG_DXFER_NONE) {
-
 		if (lba < MAX_10B_LBA) {
 			hdr->cmdp[2] = (unsigned char) ((lba >> 24) & 0xff);
 			hdr->cmdp[3] = (unsigned char) ((lba >> 16) & 0xff);
@@ -416,17 +414,16 @@
 	}
 
 	*bs	 = (buf[4] << 24) | (buf[5] << 16) | (buf[6] << 8) | buf[7];
-	*max_lba = ((buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]) & 0x00000000FFFFFFFFULL;  // for some reason max_lba is being sign extended even though unsigned.
-
+	*max_lba = ((buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]) & MAX_10B_LBA;  // for some reason max_lba is being sign extended even though unsigned.
 
 	/*
-	 * If max lba is 0xFFFFFFFF, then need to retry with
-	 * 16 byteread capacity
+	 * If max lba masked by MAX_10B_LBA equals MAX_10B_LBA,
+	 * then need to retry with 16 byte Read Capacity command.
 	 */
 	if (*max_lba == MAX_10B_LBA) {
 		hdr.cmd_len = 16;
-		hdr.cmdp[0] = 0x9e; // Read Capacity(16)
-		hdr.cmdp[1] = 0x10; // service action
+		hdr.cmdp[0] = 0x9e; // service action
+		hdr.cmdp[1] = 0x10; // Read Capacity(16)
 		hdr.cmdp[10] = (unsigned char) ((sizeof(buf) >> 24) & 0xff);
 		hdr.cmdp[11] = (unsigned char) ((sizeof(buf) >> 16) & 0xff);
 		hdr.cmdp[12] = (unsigned char) ((sizeof(buf) >> 8) & 0xff);
@@ -507,8 +504,7 @@
 	unsigned int bs = 0;
 	unsigned long long max_lba = 0;
 
-
-	if (f->filetype == FIO_TYPE_BD) {
+	if (f->filetype == FIO_TYPE_BLOCK) {
 		if (ioctl(f->fd, BLKSSZGET, &bs) < 0) {
 			td_verror(td, errno, "ioctl");
 			return 1;
@@ -529,19 +525,19 @@
 		}
 	} else {
 		td_verror(td, EINVAL, "wrong file type");
-		log_err("ioengine sg only works on block devices\n");
+		log_err("ioengine sg only works on block or character devices\n");
 		return 1;
 	}
 
 	sd->bs = bs;
 	// Determine size of commands needed based on max_lba
-	sd->max_lba = max_lba;
-	if (max_lba > MAX_10B_LBA) {
-		dprint(FD_IO, "sgio_type_check: using 16 byte operations: max_lba = 0x%016llx\n", max_lba);
+	if (max_lba >= MAX_10B_LBA) {
+		dprint(FD_IO, "sgio_type_check: using 16 byte read/write "
+			"commands for lba above 0x%016llx/0x%016llx\n",
+			MAX_10B_LBA, max_lba);
 	}
 
-
-	if (f->filetype == FIO_TYPE_BD) {
+	if (f->filetype == FIO_TYPE_BLOCK) {
 		td->io_ops->getevents = NULL;
 		td->io_ops->event = NULL;
 	}
@@ -576,17 +572,17 @@
 	struct sg_io_hdr *hdr = &io_u->hdr;
 #define MAXERRDETAIL 1024
 #define MAXMSGCHUNK  128
-	char *msg, msgchunk[MAXMSGCHUNK], *ret = NULL;
+	char *msg, msgchunk[MAXMSGCHUNK];
 	int i;
 
-	msg = calloc(MAXERRDETAIL, 1);
+	msg = calloc(1, MAXERRDETAIL);
+	strcpy(msg, "");
 
 	/*
 	 * can't seem to find sg_err.h, so I'll just echo the define values
 	 * so others can search on internet to find clearer clues of meaning.
 	 */
 	if (hdr->info & SG_INFO_CHECK) {
-		ret = msg;
 		if (hdr->host_status) {
 			snprintf(msgchunk, MAXMSGCHUNK, "SG Host Status: 0x%02x; ", hdr->host_status);
 			strlcat(msg, msgchunk, MAXERRDETAIL);
@@ -630,6 +626,24 @@
 			case 0x0d:
 				strlcat(msg, "SG_ERR_DID_REQUEUE", MAXERRDETAIL);
 				break;
+			case 0x0e:
+				strlcat(msg, "SG_ERR_DID_TRANSPORT_DISRUPTED", MAXERRDETAIL);
+				break;
+			case 0x0f:
+				strlcat(msg, "SG_ERR_DID_TRANSPORT_FAILFAST", MAXERRDETAIL);
+				break;
+			case 0x10:
+				strlcat(msg, "SG_ERR_DID_TARGET_FAILURE", MAXERRDETAIL);
+				break;
+			case 0x11:
+				strlcat(msg, "SG_ERR_DID_NEXUS_FAILURE", MAXERRDETAIL);
+				break;
+			case 0x12:
+				strlcat(msg, "SG_ERR_DID_ALLOC_FAILURE", MAXERRDETAIL);
+				break;
+			case 0x13:
+				strlcat(msg, "SG_ERR_DID_MEDIUM_ERROR", MAXERRDETAIL);
+				break;
 			default:
 				strlcat(msg, "Unknown", MAXERRDETAIL);
 				break;
@@ -741,14 +755,14 @@
 		if (hdr->resid != 0) {
 			snprintf(msgchunk, MAXMSGCHUNK, "SG Driver: %d bytes out of %d not transferred. ", hdr->resid, hdr->dxfer_len);
 			strlcat(msg, msgchunk, MAXERRDETAIL);
-			ret = msg;
 		}
 	}
 
-	if (!ret)
-		ret = strdup("SG Driver did not report a Host, Driver or Device check");
+	if (!(hdr->info & SG_INFO_CHECK) && !strlen(msg))
+		strncpy(msg, "SG Driver did not report a Host, Driver or Device check",
+			MAXERRDETAIL - 1);
 
-	return ret;
+	return msg;
 }
 
 /*
@@ -775,6 +789,12 @@
 	if (fio_file_size_known(f))
 		return 0;
 
+	if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) {
+		td_verror(td, EINVAL, "wrong file type");
+		log_err("ioengine sg only works on block or character devices\n");
+		return 1;
+	}
+
 	ret = fio_sgio_read_capacity(td, &bs, &max_lba);
 	if (ret ) {
 		td_verror(td, td->error, "fio_sgio_read_capacity");
@@ -800,7 +820,7 @@
 	.cleanup	= fio_sgio_cleanup,
 	.open_file	= fio_sgio_open,
 	.close_file	= generic_close_file,
-	.get_file_size	= fio_sgio_get_file_size, // generic_get_file_size
+	.get_file_size	= fio_sgio_get_file_size,
 	.flags		= FIO_SYNCIO | FIO_RAWIO,
 };
 
diff -Nru fio-2.16/engines/skeleton_external.c fio-3.1/engines/skeleton_external.c
--- fio-2.16/engines/skeleton_external.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/skeleton_external.c	2017-09-28 10:23:20.000000000 +0000
@@ -3,7 +3,8 @@
  *
  * Should be compiled with:
  *
- * gcc -Wall -O2 -g -shared -rdynamic -fPIC -o engine.o engine.c
+ * gcc -Wall -O2 -g -shared -rdynamic -fPIC -o skeleton_external.o skeleton_external.c
+ * (also requires -D_GNU_SOURCE -DCONFIG_STRSEP on Linux)
  *
  */
 #include <stdio.h>
@@ -13,6 +14,7 @@
 #include <assert.h>
 
 #include "../fio.h"
+#include "../optgroup.h"
 
 /*
  * The core of the module is identical to the ones included with fio,
@@ -21,6 +23,32 @@
  */
 
 /*
+ * The io engine can define its own options within the io engine source.
+ * The option member must not be at offset 0, due to the way fio parses
+ * the given option. Just add a padding pointer unless the io engine has
+ * something usable.
+ */
+struct fio_skeleton_options {
+	void *pad; /* padding so that no real option ends up at offset 0 */
+	unsigned int dummy; /* storage for the "dummy" option below */
+};
+
+static struct fio_option options[] = {
+	{ /* "dummy": stored at a non-zero offset thanks to ->pad */
+		.name	= "dummy",
+		.lname	= "ldummy",
+		.type	= FIO_OPT_STR_SET,
+		.off1	= offsetof(struct fio_skeleton_options, dummy),
+		.help	= "Set dummy",
+		.category = FIO_OPT_C_ENGINE, /* always use this */
+		.group	= FIO_OPT_G_INVALID, /* this can be different */
+	},
+	{
+		.name	= NULL, /* NOTE(review): looks like the list terminator - confirm */
+	},
+};
+
+/*
  * The ->event() hook is called to match an event number with an io_u.
  * After the core has called ->getevents() and it has returned eg 3,
  * the ->event() hook must return the 3 events that have completed for
@@ -109,11 +137,11 @@
 
 /*
  * Hook for opening the given file. Unless the engine has special
- * needs, it usually just provides generic_file_open() as the handler.
+ * needs, it usually just provides generic_open_file() as the handler.
  */
 static int fio_skeleton_open(struct thread_data *td, struct fio_file *f)
 {
-	return generic_file_open(td, f);
+	return generic_open_file(td, f);
 }
 
 /*
@@ -121,12 +149,12 @@
  */
 static int fio_skeleton_close(struct thread_data *td, struct fio_file *f)
 {
-	generic_file_close(td, f);
+	return generic_close_file(td, f);
 }
 
 /*
  * Note that the structure is exported, so that fio can get it via
- * dlsym(..., "ioengine");
+ * dlsym(..., "ioengine"); for (and only for) external engines.
  */
 struct ioengine_ops ioengine = {
 	.name		= "engine_name",
@@ -140,4 +168,6 @@
 	.cleanup	= fio_skeleton_cleanup,
 	.open_file	= fio_skeleton_open,
 	.close_file	= fio_skeleton_close,
+	.options	= options,
+	.option_struct_size	= sizeof(struct fio_skeleton_options),
 };
diff -Nru fio-2.16/engines/splice.c fio-3.1/engines/splice.c
--- fio-2.16/engines/splice.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/splice.c	2017-09-28 10:23:20.000000000 +0000
@@ -32,7 +32,7 @@
 	struct fio_file *f = io_u->file;
 	int ret, ret2, buflen;
 	off_t offset;
-	void *p;
+	char *p;
 
 	offset = io_u->offset;
 	buflen = io_u->xfer_buflen;
@@ -77,7 +77,8 @@
 	struct iovec iov;
 	int ret , buflen, mmap_len;
 	off_t offset;
-	void *p, *map;
+	void *map;
+	char *p;
 
 	ret = 0;
 	offset = io_u->offset;
diff -Nru fio-2.16/engines/sync.c fio-3.1/engines/sync.c
--- fio-2.16/engines/sync.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/sync.c	2017-09-28 10:23:20.000000000 +0000
@@ -14,11 +14,12 @@
 
 #include "../fio.h"
 #include "../optgroup.h"
+#include "../lib/rand.h"
 
 /*
  * Sync engine uses engine_data to store last offset
  */
-#define LAST_POS(f)	((f)->engine_data)
+#define LAST_POS(f)	((f)->engine_pos)
 
 struct syncio_data {
 	struct iovec *iovecs;
@@ -30,12 +31,15 @@
 	unsigned long long last_offset;
 	struct fio_file *last_file;
 	enum fio_ddir last_ddir;
+
+	struct frand_state rand_state;
 };
 
 #ifdef FIO_HAVE_PWRITEV2
 struct psyncv2_options {
 	void *pad;
 	unsigned int hipri;
+	unsigned int hipri_percentage;
 };
 
 static struct fio_option options[] = {
@@ -49,6 +53,18 @@
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
+		.name	= "hipri_percentage",
+		.lname	= "RWF_HIPRI_PERCENTAGE",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct psyncv2_options, hipri_percentage),
+		.minval	= 0,
+		.maxval	= 100,
+		.def    = "100",
+		.help	= "Probabilistically set RWF_HIPRI for pwritev2/preadv2",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
 		.name	= NULL,
 	},
 };
@@ -132,7 +148,8 @@
 
 	fio_ro_check(td, io_u);
 
-	if (o->hipri)
+	if (o->hipri &&
+	    (rand32_between(&sd->rand_state, 1, 100) <= o->hipri_percentage))
 		flags |= RWF_HIPRI;
 
 	iov->iov_base = io_u->xfer_buf;
@@ -363,6 +380,7 @@
 	sd->last_offset = -1ULL;
 	sd->iovecs = malloc(td->o.iodepth * sizeof(struct iovec));
 	sd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *));
+	init_rand(&sd->rand_state, 0);
 
 	td->io_ops_data = sd;
 	return 0;
diff -Nru fio-2.16/engines/windowsaio.c fio-3.1/engines/windowsaio.c
--- fio-2.16/engines/windowsaio.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/engines/windowsaio.c	2017-09-28 10:23:20.000000000 +0000
@@ -35,17 +35,7 @@
 	struct windowsaio_data *wd;
 };
 
-static BOOL timeout_expired(DWORD start_count, DWORD end_count);
-static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
-				unsigned int max, const struct timespec *t);
-static struct io_u *fio_windowsaio_event(struct thread_data *td, int event);
-static int fio_windowsaio_queue(struct thread_data *td,
-				  struct io_u *io_u);
-static void fio_windowsaio_cleanup(struct thread_data *td);
 static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter);
-static int fio_windowsaio_init(struct thread_data *td);
-static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f);
-static int fio_windowsaio_close_file(struct thread_data fio_unused *td, struct fio_file *f);
 
 static int fio_windowsaio_init(struct thread_data *td)
 {
@@ -152,7 +142,6 @@
 	}
 }
 
-
 static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f)
 {
 	int rc = 0;
@@ -180,13 +169,26 @@
 
 	/*
 	 * Inform Windows whether we're going to be doing sequential or
-	 * random io so it can tune the Cache Manager
+	 * random IO so it can tune the Cache Manager
 	 */
-	if (td->o.td_ddir == TD_DDIR_READ  ||
-		td->o.td_ddir == TD_DDIR_WRITE)
-		flags |= FILE_FLAG_SEQUENTIAL_SCAN;
-	else
+	switch (td->o.fadvise_hint) {
+	case F_ADV_TYPE:
+		if (td_random(td))
+			flags |= FILE_FLAG_RANDOM_ACCESS;
+		else
+			flags |= FILE_FLAG_SEQUENTIAL_SCAN;
+		break;
+	case F_ADV_RANDOM:
 		flags |= FILE_FLAG_RANDOM_ACCESS;
+		break;
+	case F_ADV_SEQUENTIAL:
+		flags |= FILE_FLAG_SEQUENTIAL_SCAN;
+		break;
+	case F_ADV_NONE:
+		break;
+	default:
+		log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint);
+	}
 
 	if (!td_write(td) || read_only)
 		access = GENERIC_READ;
diff -Nru fio-2.16/eta.c fio-3.1/eta.c
--- fio-2.16/eta.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/eta.c	2017-09-28 10:23:20.000000000 +0000
@@ -308,7 +308,7 @@
 
 		diff = io_bytes[i] - prev_io_bytes[i];
 		if (mtime)
-			this_rate = ((1000 * diff) / mtime) / 1024;
+			this_rate = ((1000 * diff) / mtime) / 1024; /* KiB/s */
 		else
 			this_rate = 0;
 
@@ -358,12 +358,12 @@
 	uint64_t rate_time, disp_time, bw_avg_time, *eta_secs;
 	unsigned long long io_bytes[DDIR_RWDIR_CNT];
 	unsigned long long io_iops[DDIR_RWDIR_CNT];
-	struct timeval now;
+	struct timespec now;
 
 	static unsigned long long rate_io_bytes[DDIR_RWDIR_CNT];
 	static unsigned long long disp_io_bytes[DDIR_RWDIR_CNT];
 	static unsigned long long disp_io_iops[DDIR_RWDIR_CNT];
-	static struct timeval rate_prev_time, disp_prev_time;
+	static struct timespec rate_prev_time, disp_prev_time;
 
 	if (!force) {
 		if (!(output_format & FIO_OUTPUT_NORMAL) &&
@@ -440,7 +440,7 @@
 		if (td->runstate > TD_SETTING_UP) {
 			int ddir;
 
-			for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+			for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 				if (unified_rw_rep) {
 					io_bytes[0] += td->io_bytes[ddir];
 					io_iops[0] += td->io_blocks[ddir];
@@ -511,7 +511,7 @@
 
 void display_thread_status(struct jobs_eta *je)
 {
-	static struct timeval disp_eta_new_line;
+	static struct timespec disp_eta_new_line;
 	static int eta_new_line_init, eta_new_line_pending;
 	static int linelen_last;
 	static int eta_good;
@@ -530,19 +530,28 @@
 	}
 
 	p += sprintf(p, "Jobs: %d (f=%d)", je->nr_running, je->files_open);
-	if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) {
+
+	/* rate limits, if any */
+	if (je->m_rate[0] || je->m_rate[1] || je->m_rate[2] ||
+	    je->t_rate[0] || je->t_rate[1] || je->t_rate[2]) {
 		char *tr, *mr;
 
-		mr = num2str(je->m_rate[0] + je->m_rate[1], 4, 0, je->is_pow2, 8);
-		tr = num2str(je->t_rate[0] + je->t_rate[1], 4, 0, je->is_pow2, 8);
-		p += sprintf(p, ", CR=%s/%s KB/s", tr, mr);
+		mr = num2str(je->m_rate[0] + je->m_rate[1] + je->m_rate[2],
+				4, 0, je->is_pow2, N2S_BYTEPERSEC);
+		tr = num2str(je->t_rate[0] + je->t_rate[1] + je->t_rate[2],
+				4, 0, je->is_pow2, N2S_BYTEPERSEC);
+
+		p += sprintf(p, ", %s-%s", mr, tr);
 		free(tr);
 		free(mr);
-	} else if (je->m_iops[0] || je->m_iops[1] || je->t_iops[0] || je->t_iops[1]) {
-		p += sprintf(p, ", CR=%d/%d IOPS",
-					je->t_iops[0] + je->t_iops[1],
-					je->m_iops[0] + je->m_iops[1]);
+	} else if (je->m_iops[0] || je->m_iops[1] || je->m_iops[2] ||
+		   je->t_iops[0] || je->t_iops[1] || je->t_iops[2]) {
+		p += sprintf(p, ", %d-%d IOPS",
+					je->m_iops[0] + je->m_iops[1] + je->m_iops[2],
+					je->t_iops[0] + je->t_iops[1] + je->t_iops[2]);
 	}
+
+	/* current run string, % done, bandwidth, iops, eta */
 	if (je->eta_sec != INT_MAX && je->nr_running) {
 		char perc_str[32];
 		char *iops_str[DDIR_RWDIR_CNT];
@@ -553,7 +562,7 @@
 
 		if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running ||
 		    je->eta_sec == -1)
-			strcpy(perc_str, "-.-% done");
+			strcpy(perc_str, "-.-%");
 		else {
 			double mult = 100.0;
 
@@ -562,28 +571,37 @@
 
 			eta_good = 1;
 			perc *= mult;
-			sprintf(perc_str, "%3.1f%% done", perc);
+			sprintf(perc_str, "%3.1f%%", perc);
 		}
 
-		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
-			rate_str[ddir] = num2str(je->rate[ddir], 5,
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+			rate_str[ddir] = num2str(je->rate[ddir], 4,
 						1024, je->is_pow2, je->unit_base);
-			iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, 0);
+			iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, N2S_NONE);
 		}
 
 		left = sizeof(output) - (p - output) - 1;
 
-		l = snprintf(p, left, ": [%s] [%s] [%s/%s/%s /s] [%s/%s/%s iops] [eta %s]",
+		if (je->rate[DDIR_TRIM] || je->iops[DDIR_TRIM])
+			l = snprintf(p, left,
+				": [%s][%s][r=%s,w=%s,t=%s][r=%s,w=%s,t=%s IOPS][eta %s]",
 				je->run_str, perc_str, rate_str[DDIR_READ],
 				rate_str[DDIR_WRITE], rate_str[DDIR_TRIM],
 				iops_str[DDIR_READ], iops_str[DDIR_WRITE],
 				iops_str[DDIR_TRIM], eta_str);
+		else
+			l = snprintf(p, left,
+				": [%s][%s][r=%s,w=%s][r=%s,w=%s IOPS][eta %s]",
+				je->run_str, perc_str,
+				rate_str[DDIR_READ], rate_str[DDIR_WRITE],
+				iops_str[DDIR_READ], iops_str[DDIR_WRITE],
+				eta_str);
 		p += l;
 		if (l >= 0 && l < linelen_last)
 			p += sprintf(p, "%*s", linelen_last - l, "");
 		linelen_last = l;
 
-		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 			free(rate_str[ddir]);
 			free(iops_str[ddir]);
 		}
diff -Nru fio-2.16/examples/butterfly.fio fio-3.1/examples/butterfly.fio
--- fio-2.16/examples/butterfly.fio	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/examples/butterfly.fio	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,19 @@
+# Perform a butterfly/funnel seek pattern. This won't always alternate ends on
+# every I/O but it will get close.
+
+[global]
+filename=/tmp/testfile
+bs=4k
+direct=1
+
+[forward]
+rw=read
+flow=2
+# Uncomment the size= and offset= lines to prevent each direction going past
+# the middle of the file
+#size=50%
+
+[backward]
+rw=read:-8k
+flow=-2
+#offset=50%
diff -Nru fio-2.16/examples/ftruncate.fio fio-3.1/examples/ftruncate.fio
--- fio-2.16/examples/ftruncate.fio	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/examples/ftruncate.fio	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,27 @@
+# Example ftruncate engine jobs
+
+[global]
+ioengine=ftruncate
+directory=/scratch
+size=102404k ; 100MiB+4KiB
+stonewall
+filename=truncate
+runtime=10s
+time_based
+direct=1
+#
+# The bs option is a stub here: truncation is performed at the current block
+# offset, and the blocksize value itself is ignored.
+bs=4k
+
+# Truncate the file to 4 KiB, then repeatedly grow it back to just over
+# its original size using subsequent truncates
+[grow-truncate]
+rw=write
+
+# Repeatedly change a file to a random size between 0 KiB and 100 MiB
+# using truncates
+[rand-truncate]
+rw=randwrite
+norandommap
+
diff -Nru fio-2.16/examples/gpudirect-rdmaio-client.fio fio-3.1/examples/gpudirect-rdmaio-client.fio
--- fio-2.16/examples/gpudirect-rdmaio-client.fio	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/examples/gpudirect-rdmaio-client.fio	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,15 @@
+# Example gpudirect rdma client job
+[global]
+ioengine=rdma
+hostname=[hostname]
+port=[port]
+verb=[read/write/send/recv]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[sender]
+rw=write
+iodepth=1
+iodepth_batch_complete=1
diff -Nru fio-2.16/examples/gpudirect-rdmaio-server.fio fio-3.1/examples/gpudirect-rdmaio-server.fio
--- fio-2.16/examples/gpudirect-rdmaio-server.fio	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/examples/gpudirect-rdmaio-server.fio	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,12 @@
+# Example gpudirect rdma server job
+[global]
+ioengine=rdma
+port=[port]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[receiver]
+rw=read
+iodepth=16
diff -Nru fio-2.16/examples/mtd.fio fio-3.1/examples/mtd.fio
--- fio-2.16/examples/mtd.fio	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/examples/mtd.fio	2017-09-28 10:23:20.000000000 +0000
@@ -17,5 +17,5 @@
 [write]
 stonewall
 block_error_percentiles=1
-rw=writetrim
+rw=trimwrite
 loops=4
diff -Nru fio-2.16/exp/README.md fio-3.1/exp/README.md
--- fio-2.16/exp/README.md	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/exp/README.md	1970-01-01 00:00:00.000000000 +0000
@@ -1,7 +0,0 @@
-simple-expression-parser
-========================
-
-A simple expression parser for arithmetic expressions made with bison + flex
-
-To use, see the example test-expression-parser.c
-
diff -Nru fio-2.16/file.h fio-3.1/file.h
--- fio-2.16/file.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/file.h	2017-09-28 10:23:20.000000000 +0000
@@ -15,7 +15,7 @@
  */
 enum fio_filetype {
 	FIO_TYPE_FILE = 1,		/* plain file */
-	FIO_TYPE_BD,			/* block device */
+	FIO_TYPE_BLOCK,			/* block device */
 	FIO_TYPE_CHAR,			/* character device */
 	FIO_TYPE_PIPE,			/* pipe */
 };
@@ -63,6 +63,7 @@
 	FIO_FALLOCATE_NONE	= 1,
 	FIO_FALLOCATE_POSIX	= 2,
 	FIO_FALLOCATE_KEEP_SIZE	= 3,
+	FIO_FALLOCATE_NATIVE	= 4,
 };
 
 /*
@@ -90,6 +91,7 @@
 
 	/*
 	 * size of the file, offset into file, and io size from that offset
+	 * (be aware io_size is different from thread_options::io_size)
 	 */
 	uint64_t real_file_size;
 	uint64_t file_offset;
@@ -112,9 +114,12 @@
 	unsigned int last_write_idx;
 
 	/*
-	 * For use by the io engine
+	 * For use by the io engine for offset or private data storage
 	 */
-	uint64_t engine_data;
+	union {
+		uint64_t engine_pos;
+		void *engine_data;
+	};
 
 	/*
 	 * if io is protected by a semaphore, this is set
@@ -146,14 +151,8 @@
 	struct disk_util *du;
 };
 
-#define FILE_ENG_DATA(f)	((void *) (uintptr_t) (f)->engine_data)
-#define FILE_SET_ENG_DATA(f, data)	\
-	((f)->engine_data = (uintptr_t) (data))
-
-struct file_name {
-	struct flist_head list;
-	char *filename;
-};
+#define FILE_ENG_DATA(f)		((f)->engine_data)
+#define FILE_SET_ENG_DATA(f, data)	((f)->engine_data = (data))
 
 #define FILE_FLAG_FNS(name)						\
 static inline void fio_file_set_##name(struct fio_file *f)		\
@@ -212,5 +211,6 @@
 extern void fio_file_reset(struct thread_data *, struct fio_file *);
 extern bool fio_files_done(struct thread_data *);
 extern bool exists_and_not_regfile(const char *);
+extern int fio_set_directio(struct thread_data *, struct fio_file *);
 
 #endif
diff -Nru fio-2.16/filesetup.c fio-3.1/filesetup.c
--- fio-2.16/filesetup.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/filesetup.c	2017-09-28 10:23:20.000000000 +0000
@@ -24,18 +24,90 @@
 
 static FLIST_HEAD(filename_list);
 
+/*
+ * List entry for filename_list
+ */
+struct file_name {
+	struct flist_head list;	/* links this entry into filename_list */
+	char *filename;
+};
+
 static inline void clear_error(struct thread_data *td)
 {
 	td->error = 0;
 	td->verror[0] = '\0';
 }
 
+/* Preallocate f natively: 0 on success, -1 with fio_fallocate()'s errno kept */
+static inline int native_fallocate(struct thread_data *td, struct fio_file *f)
+{
+	bool success = fio_fallocate(f, 0, f->real_file_size);
+	int err = errno; /* saved in case the dprint() below clobbers errno */
+
+	dprint(FD_FILE, "native fallocate of file %s size %llu was "
+			"%ssuccessful\n", f->file_name,
+			(unsigned long long) f->real_file_size,
+			!success ? "un": "");
+	if (success)
+		return 0;
+
+	if (err == ENOSYS)
+		dprint(FD_FILE, "native fallocate is not implemented\n");
+	errno = err; /* fallocate_file() inspects errno to report the failure */
+	return -1;
+}
+
+static void fallocate_file(struct thread_data *td, struct fio_file *f)
+{
+	int r;	/* status of the mode-specific call */
+
+	if (td->o.fill_device)
+		return;	/* no preallocation when fill_device is set */
+	/* Preallocate f's full real_file_size using the configured mode */
+	switch (td->o.fallocate_mode) {
+	case FIO_FALLOCATE_NATIVE:
+		r = native_fallocate(td, f);	/* dprints on ENOSYS itself */
+		if (r != 0 && errno != ENOSYS)
+			log_err("fio: native_fallocate call failed: %s\n",
+					strerror(errno));
+		break;
+	case FIO_FALLOCATE_NONE:
+		break;
+#ifdef CONFIG_POSIX_FALLOCATE
+	case FIO_FALLOCATE_POSIX:
+		dprint(FD_FILE, "posix_fallocate file %s size %llu\n",
+				 f->file_name,
+				 (unsigned long long) f->real_file_size);
+
+		r = posix_fallocate(f->fd, 0, f->real_file_size);
+		if (r > 0)	/* the error number is the return value, not errno */
+			log_err("fio: posix_fallocate fails: %s\n", strerror(r));
+		break;
+#endif /* CONFIG_POSIX_FALLOCATE */
+#ifdef CONFIG_LINUX_FALLOCATE
+	case FIO_FALLOCATE_KEEP_SIZE:
+		dprint(FD_FILE, "fallocate(FALLOC_FL_KEEP_SIZE) "
+				"file %s size %llu\n", f->file_name,
+				(unsigned long long) f->real_file_size);
+
+		r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0, f->real_file_size);
+		if (r != 0)	/* only this mode reports via td_verror() */
+			td_verror(td, errno, "fallocate");
+
+		break;
+#endif /* CONFIG_LINUX_FALLOCATE */
+	default:	/* also reached for modes whose #ifdef above is not compiled in */
+		log_err("fio: unknown fallocate mode: %d\n", td->o.fallocate_mode);
+		assert(0);
+	}
+}
+
 /*
  * Leaves f->fd open on success, caller must close
  */
 static int extend_file(struct thread_data *td, struct fio_file *f)
 {
-	int r, new_layout = 0, unlink_file = 0, flags;
+	int new_layout = 0, unlink_file = 0, flags;
 	unsigned long long left;
 	unsigned int bs;
 	char *b = NULL;
@@ -92,44 +164,11 @@
 		return 1;
 	}
 
-#ifdef CONFIG_POSIX_FALLOCATE
-	if (!td->o.fill_device) {
-		switch (td->o.fallocate_mode) {
-		case FIO_FALLOCATE_NONE:
-			break;
-		case FIO_FALLOCATE_POSIX:
-			dprint(FD_FILE, "posix_fallocate file %s size %llu\n",
-				 f->file_name,
-				 (unsigned long long) f->real_file_size);
-
-			r = posix_fallocate(f->fd, 0, f->real_file_size);
-			if (r > 0) {
-				log_err("fio: posix_fallocate fails: %s\n",
-						strerror(r));
-			}
-			break;
-#ifdef CONFIG_LINUX_FALLOCATE
-		case FIO_FALLOCATE_KEEP_SIZE:
-			dprint(FD_FILE,
-				"fallocate(FALLOC_FL_KEEP_SIZE) "
-				"file %s size %llu\n", f->file_name,
-				(unsigned long long) f->real_file_size);
-
-			r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0,
-					f->real_file_size);
-			if (r != 0)
-				td_verror(td, errno, "fallocate");
-
-			break;
-#endif /* CONFIG_LINUX_FALLOCATE */
-		default:
-			log_err("fio: unknown fallocate mode: %d\n",
-				td->o.fallocate_mode);
-			assert(0);
-		}
-	}
-#endif /* CONFIG_POSIX_FALLOCATE */
+	fallocate_file(td, f);
 
+	/*
+	 * If our jobs don't require regular files initially, we're done.
+	 */
 	if (!new_layout)
 		goto done;
 
@@ -148,11 +187,20 @@
 		}
 	}
 
-	b = malloc(td->o.max_bs[DDIR_WRITE]);
-
 	left = f->real_file_size;
+	bs = td->o.max_bs[DDIR_WRITE];
+	if (bs > left)
+		bs = left;
+
+	b = malloc(bs);
+	if (!b) {
+		td_verror(td, errno, "malloc");
+		goto err;
+	}
+
 	while (left && !td->terminate) {
-		bs = td->o.max_bs[DDIR_WRITE];
+		ssize_t r;
+
 		if (bs > left)
 			bs = left;
 
@@ -217,7 +265,11 @@
 	unsigned int bs;
 	char *b;
 
-	if (td_ioengine_flagged(td, FIO_PIPEIO))
+	if (td_ioengine_flagged(td, FIO_PIPEIO) ||
+	    td_ioengine_flagged(td, FIO_NOIO))
+		return 0;
+
+	if (f->filetype == FIO_TYPE_CHAR)
 		return 0;
 
 	if (!fio_file_open(f)) {
@@ -230,8 +282,17 @@
 
 	old_runstate = td_bump_runstate(td, TD_PRE_READING);
 
+	left = f->io_size;
 	bs = td->o.max_bs[DDIR_READ];
+	if (bs > left)
+		bs = left;
+
 	b = malloc(bs);
+	if (!b) {
+		td_verror(td, errno, "malloc");
+		ret = 1;
+		goto error;
+	}
 	memset(b, 0, bs);
 
 	if (lseek(f->fd, f->file_offset, SEEK_SET) < 0) {
@@ -241,8 +302,6 @@
 		goto error;
 	}
 
-	left = f->io_size;
-
 	while (left && !td->terminate) {
 		if (bs > left)
 			bs = left;
@@ -370,16 +429,38 @@
 
 	if (f->filetype == FIO_TYPE_FILE)
 		ret = file_size(td, f);
-	else if (f->filetype == FIO_TYPE_BD)
+	else if (f->filetype == FIO_TYPE_BLOCK)
 		ret = bdev_size(td, f);
 	else if (f->filetype == FIO_TYPE_CHAR)
 		ret = char_size(td, f);
 	else
-		f->real_file_size = -1;
+		f->real_file_size = -1ULL;
 
+	/*
+	 * Leave ->real_file_size with 0 since it could be expectation
+	 * of initial setup for regular files.
+	 */
 	if (ret)
 		return ret;
 
+	/*
+	 * If ->real_file_size is -1, a conditional for the message
+	 * "offset extends end" is always true, but it makes no sense,
+	 * so just return the same value here.
+	 */
+	if (f->real_file_size == -1ULL) {
+		log_info("%s: failed to get file size of %s\n", td->o.name,
+					f->file_name);
+		return 1;
+	}
+
+	if (td->o.start_offset && f->file_offset == 0)
+		dprint(FD_FILE, "offset of file %s not initialized yet\n",
+					f->file_name);
+	/*
+	 * ->file_offset normally hasn't been initialized yet, so this
+	 * is basically always false.
+	 */
 	if (f->file_offset > f->real_file_size) {
 		log_err("%s: offset extends end (%llu > %llu)\n", td->o.name,
 					(unsigned long long) f->file_offset,
@@ -409,20 +490,22 @@
 	if (len == -1ULL || off == -1ULL)
 		return 0;
 
-	dprint(FD_IO, "invalidate cache %s: %llu/%llu\n", f->file_name, off,
-								len);
-
 	if (td->io_ops->invalidate) {
+		dprint(FD_IO, "invalidate %s cache %s\n", td->io_ops->name,
+			f->file_name);
 		ret = td->io_ops->invalidate(td, f);
 		if (ret < 0)
-			errval = ret;
+			errval = -ret;
 	} else if (f->filetype == FIO_TYPE_FILE) {
+		dprint(FD_IO, "declare unneeded cache %s: %llu/%llu\n",
+			f->file_name, off, len);
 		ret = posix_fadvise(f->fd, off, len, POSIX_FADV_DONTNEED);
 		if (ret)
 			errval = ret;
-	} else if (f->filetype == FIO_TYPE_BD) {
+	} else if (f->filetype == FIO_TYPE_BLOCK) {
 		int retry_count = 0;
 
+		dprint(FD_IO, "drop page cache %s\n", f->file_name);
 		ret = blockdev_invalidate_cache(f);
 		while (ret < 0 && errno == EAGAIN && retry_count++ < 25) {
 			/*
@@ -444,8 +527,11 @@
 		}
 		if (ret < 0)
 			errval = errno;
-	} else if (f->filetype == FIO_TYPE_CHAR || f->filetype == FIO_TYPE_PIPE)
+	} else if (f->filetype == FIO_TYPE_CHAR ||
+		   f->filetype == FIO_TYPE_PIPE) {
+		dprint(FD_IO, "invalidate not supported %s\n", f->file_name);
 		ret = 0;
+	}
 
 	/*
 	 * Cache flushing isn't a fatal condition, and we know it will
@@ -454,7 +540,8 @@
 	 * continue on our way.
 	 */
 	if (errval)
-		log_info("fio: cache invalidation of %s failed: %s\n", f->file_name, strerror(errval));
+		log_info("fio: cache invalidation of %s failed: %s\n",
+			 f->file_name, strerror(errval));
 
 	return 0;
 
@@ -486,7 +573,7 @@
 		f->shadow_fd = -1;
 	}
 
-	f->engine_data = 0;
+	f->engine_pos = 0;
 	return ret;
 }
 
@@ -498,9 +585,6 @@
 	__f = lookup_file_hash(f->file_name);
 	if (__f) {
 		dprint(FD_FILE, "found file in hash %s\n", f->file_name);
-		/*
-		 * racy, need the __f->lock locked
-		 */
 		f->lock = __f->lock;
 		from_hash = 1;
 	} else {
@@ -597,7 +681,8 @@
 			f->fd = dup(STDIN_FILENO);
 		else
 			from_hash = file_lookup_open(f, flags);
-	} else { //td trim
+	} else if (td_trim(td)) {
+		assert(!td_rw(td)); /* should have matched above */
 		flags |= O_RDWR;
 		from_hash = file_lookup_open(f, flags);
 	}
@@ -652,6 +737,10 @@
 	return 0;
 }
 
+/*
+ * This function, i.e. get_file_size(), is the default .get_file_size
+ * implementation for the majority of I/O engines.
+ */
 int generic_get_file_size(struct thread_data *td, struct fio_file *f)
 {
 	return get_file_size(td, f);
@@ -667,7 +756,7 @@
 	int err = 0;
 
 	for_each_file(td, f, i) {
-		dprint(FD_FILE, "get file size for %p/%d/%p\n", f, i,
+		dprint(FD_FILE, "get file size for %p/%d/%s\n", f, i,
 								f->file_name);
 
 		if (td_io_get_file_size(td, f)) {
@@ -679,6 +768,13 @@
 			clear_error(td);
 		}
 
+		/*
+		 * There are corner cases where we end up with -1 for
+		 * ->real_file_size due to unsupported file type, etc.
+		 * We then just set to size option value divided by number
+		 * of files, similar to the way file ->io_size is set.
+		 * stat(2) failure doesn't set ->real_file_size to -1.
+		 */
 		if (f->real_file_size == -1ULL && td->o.size)
 			f->real_file_size = td->o.size / td->o.nr_files;
 	}
@@ -709,7 +805,7 @@
 		struct stat sb;
 		char buf[256];
 
-		if (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_CHAR) {
+		if (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_CHAR) {
 			if (f->real_file_size != -1ULL)
 				ret += f->real_file_size;
 			continue;
@@ -765,12 +861,42 @@
 uint64_t get_start_offset(struct thread_data *td, struct fio_file *f)
 {
 	struct thread_options *o = &td->o;
+	unsigned long long align_bs;
+	unsigned long long offset;
 
 	if (o->file_append && f->filetype == FIO_TYPE_FILE)
 		return f->real_file_size;
 
-	return td->o.start_offset +
-		td->subjob_number * td->o.offset_increment;
+	if (o->start_offset_percent > 0) {
+		/*
+		 * if blockalign is provided, find the min across read, write,
+		 * and trim
+		 */
+		if (fio_option_is_set(o, ba)) {
+			align_bs = (unsigned long long) min(o->ba[DDIR_READ], o->ba[DDIR_WRITE]);
+			align_bs = min((unsigned long long) o->ba[DDIR_TRIM], align_bs);
+		} else {
+			/* else take the minimum block size */
+			align_bs = td_min_bs(td);
+		}
+
+		/* calculate the raw offset */
+		offset = (f->real_file_size * o->start_offset_percent / 100) +
+			(td->subjob_number * o->offset_increment);
+
+		/*
+		 * block align the offset at the next available boundary at
+		 * ceiling(offset / align_bs) * align_bs
+		 */
+		offset = (offset / align_bs + (offset % align_bs != 0)) * align_bs;
+
+	} else {
+		/* start_offset_percent not set */
+		offset = o->start_offset +
+				td->subjob_number * o->offset_increment;
+	}
+
+	return offset;
 }
 
 /*
@@ -795,7 +921,9 @@
 		goto done;
 
 	/*
-	 * if ioengine defines a setup() method, it's responsible for
+	 * Find out physical size of files or devices for this thread,
+	 * before we determine I/O size and range of our targets.
+	 * If ioengine defines a setup() method, it's responsible for
 	 * opening the files and setting f->real_file_size to indicate
 	 * the valid range for that file.
 	 */
@@ -836,7 +964,7 @@
 
 	/*
 	 * Calculate per-file size and potential extra size for the
-	 * first files, if needed.
+	 * first files, if needed (i.e. if we don't have a fixed size).
 	 */
 	if (!o->file_size_low && o->nr_files) {
 		uint64_t all_fs;
@@ -858,11 +986,18 @@
 	for_each_file(td, f, i) {
 		f->file_offset = get_start_offset(td, f);
 
+		/*
+		 * Update ->io_size depending on options specified.
+		 * ->file_size_low being 0 means filesize option isn't set.
+		 * Non zero ->file_size_low equals ->file_size_high means
+		 * filesize option is set in a fixed size format.
+		 * Non zero ->file_size_low not equals ->file_size_high means
+		 * filesize option is set in a range format.
+		 */
 		if (!o->file_size_low) {
 			/*
-			 * no file size range given, file size is equal to
-			 * total size divided by number of files. If that is
-			 * zero, set it to the real file size. If the size
+			 * no file size or range given, file size is equal to
+			 * total size divided by number of files. If the size
 			 * doesn't divide nicely with the min blocksize,
 			 * make the first files bigger.
 			 */
@@ -872,8 +1007,22 @@
 				f->io_size += bs;
 			}
 
-			if (!f->io_size)
+			/*
+			 * We normally don't come here for regular files, but
+			 * if the result is 0 for a regular file, set it to the
+			 * real file size. This could be size of the existing
+			 * one if it already exists, but otherwise will be set
+			 * to 0. A new file won't be created because
+			 * ->io_size + ->file_offset equals ->real_file_size.
+			 */
+			if (!f->io_size) {
+				if (f->file_offset > f->real_file_size)
+					goto err_offset;
 				f->io_size = f->real_file_size - f->file_offset;
+				if (!f->io_size)
+					log_info("fio: file %s may be ignored\n",
+						f->file_name);
+			}
 		} else if (f->real_file_size < o->file_size_low ||
 			   f->real_file_size > o->file_size_high) {
 			if (f->file_offset > o->file_size_low)
@@ -895,7 +1044,14 @@
 			total_size = -1ULL;
 		else {
                         if (o->size_percent) {
-				f->io_size = (f->io_size * o->size_percent) / 100;
+				uint64_t file_size;
+
+				file_size = f->io_size + f->file_offset;
+				f->io_size = (file_size *
+					      o->size_percent) / 100;
+				if (f->io_size > (file_size - f->file_offset))
+					f->io_size = file_size - f->file_offset;
+
 				f->io_size -= (f->io_size % td_min_bs(td));
 			}
 			total_size += f->io_size;
@@ -907,9 +1063,9 @@
 			if (!o->create_on_open) {
 				need_extend++;
 				extend_size += (f->io_size + f->file_offset);
+				fio_file_set_extend(f);
 			} else
 				f->real_file_size = f->io_size + f->file_offset;
-			fio_file_set_extend(f);
 		}
 	}
 
@@ -943,14 +1099,21 @@
 	}
 
 	/*
-	 * See if we need to extend some files
+	 * See if we need to extend some files, typically needed when our
+	 * target regular files don't exist yet, but our jobs require them
+	 * initially due to read I/Os.
 	 */
 	if (need_extend) {
 		temp_stall_ts = 1;
-		if (output_format & FIO_OUTPUT_NORMAL)
-			log_info("%s: Laying out IO file(s) (%u file(s) /"
-				 " %lluMB)\n", o->name, need_extend,
-					extend_size >> 20);
+		if (output_format & FIO_OUTPUT_NORMAL) {
+			log_info("%s: Laying out IO file%s (%u file%s / %s%lluMiB)\n",
+				 o->name,
+				 need_extend > 1 ? "s" : "",
+				 need_extend,
+				 need_extend > 1 ? "s" : "",
+				 need_extend > 1 ? "total " : "",
+				 extend_size >> 20);
+		}
 
 		for_each_file(td, f, i) {
 			unsigned long long old_len = -1ULL, extend_len = -1ULL;
@@ -997,8 +1160,8 @@
 	 * stored entries.
 	 */
 	if (!o->read_iolog_file) {
-		if (o->io_limit)
-			td->total_io_size = o->io_limit * o->loops;
+		if (o->io_size)
+			td->total_io_size = o->io_size * o->loops;
 		else
 			td->total_io_size = o->size * o->loops;
 	}
@@ -1024,10 +1187,11 @@
 	dprint(FD_FILE, "pre_read files\n");
 
 	for_each_file(td, f, i) {
-		pre_read_file(td, f);
+		if (pre_read_file(td, f))
+			return -1;
 	}
 
-	return 1;
+	return 0;
 }
 
 static int __init_rand_distribution(struct thread_data *td, struct fio_file *f)
@@ -1229,12 +1393,12 @@
 	/* \\.\ is the device namespace in Windows, where every file is
 	 * a block device */
 	if (strncmp(f->file_name, "\\\\.\\", 4) == 0)
-		f->filetype = FIO_TYPE_BD;
+		f->filetype = FIO_TYPE_BLOCK;
 #endif
 
 	if (!stat(f->file_name, &sb)) {
 		if (S_ISBLK(sb.st_mode))
-			f->filetype = FIO_TYPE_BD;
+			f->filetype = FIO_TYPE_BLOCK;
 		else if (S_ISCHR(sb.st_mode))
 			f->filetype = FIO_TYPE_CHAR;
 		else if (S_ISFIFO(sb.st_mode))
@@ -1679,3 +1843,32 @@
 {
 	free_already_allocated();
 }
+
+/*
+ * This function is for platforms which support direct I/O but not O_DIRECT.
+ */
+int fio_set_directio(struct thread_data *td, struct fio_file *f)
+{
+#ifdef FIO_OS_DIRECTIO
+	int ret = fio_set_odirect(f);
+
+	if (ret) {
+		td_verror(td, ret, "fio_set_directio");
+#if defined(__sun__)
+		if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */
+			log_err("fio: doing directIO to RAW devices or ZFS not supported\n");
+		} else {
+			log_err("fio: the file system does not seem to support direct IO\n");
+		}
+#else
+		log_err("fio: the file system does not seem to support direct IO\n");
+#endif
+		return -1;
+	}
+
+	return 0;
+#else
+	log_err("fio: direct IO is not supported on this host operating system\n");
+	return -1;
+#endif
+}
diff -Nru fio-2.16/fio.1 fio-3.1/fio.1
--- fio-2.16/fio.1	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/fio.1	2017-09-28 10:23:20.000000000 +0000
@@ -1,4 +1,4 @@
-.TH fio 1 "December 2014" "User Manual"
+.TH fio 1 "August 2017" "User Manual"
 .SH NAME
 fio \- flexible I/O tester
 .SH SYNOPSIS
@@ -13,217 +13,549 @@
 .SH OPTIONS
 .TP
 .BI \-\-debug \fR=\fPtype
-Enable verbose tracing of various fio actions. May be `all' for all types
-or individual types separated by a comma (eg \-\-debug=io,file). `help' will
-list all available tracing options.
+Enable verbose tracing \fItype\fR of various fio actions. May be `all' for all \fItype\fRs
+or individual types separated by a comma (e.g. `\-\-debug=file,mem' will enable
+file and memory debugging). `help' will list all available tracing options.
+.TP
+.BI \-\-parse\-only
+Parse options only, don't start any I/O.
 .TP
 .BI \-\-output \fR=\fPfilename
 Write output to \fIfilename\fR.
 .TP
-.BI \-\-output-format \fR=\fPformat
-Set the reporting format to \fInormal\fR, \fIterse\fR, \fIjson\fR, or
-\fIjson+\fR. Multiple formats can be selected, separate by a comma. \fIterse\fR
-is a CSV based format. \fIjson+\fR is like \fIjson\fR, except it adds a full
+.BI \-\-output\-format \fR=\fPformat
+Set the reporting \fIformat\fR to `normal', `terse', `json', or
+`json+'. Multiple formats can be selected, separated by commas. `terse'
+is a CSV based format. `json+' is like `json', except it adds a full
 dump of the latency buckets.
 .TP
-.BI \-\-runtime \fR=\fPruntime
-Limit run time to \fIruntime\fR seconds.
-.TP
-.B \-\-bandwidth\-log
+.BI \-\-bandwidth\-log
 Generate aggregate bandwidth logs.
 .TP
-.B \-\-minimal
-Print statistics in a terse, semicolon-delimited format.
+.BI \-\-minimal
+Print statistics in a terse, semicolon\-delimited format.
 .TP
-.B \-\-append-terse
-Print statistics in selected mode AND terse, semicolon-delimited format.
-Deprecated, use \-\-output-format instead to select multiple formats.
-.TP
-.B \-\-version
-Display version information and exit.
+.BI \-\-append\-terse
+Print statistics in selected mode AND terse, semicolon\-delimited format.
+\fBDeprecated\fR, use \fB\-\-output\-format\fR instead to select multiple formats.
 .TP
 .BI \-\-terse\-version \fR=\fPversion
-Set terse version output format (Current version 3, or older version 2).
+Set terse \fIversion\fR output format (default `3', or `2', `4', `5').
+.TP
+.BI \-\-version
+Print version information and exit.
 .TP
-.B \-\-help
-Display usage information and exit.
+.BI \-\-help
+Print a summary of the command line options and exit.
 .TP
-.B \-\-cpuclock-test
-Perform test and validation of internal CPU clock
+.BI \-\-cpuclock\-test
+Perform test and validation of internal CPU clock.
 .TP
-.BI \-\-crctest[\fR=\fPtest]
-Test the speed of the builtin checksumming functions. If no argument is given,
-all of them are tested. Or a comma separated list can be passed, in which
+.BI \-\-crctest \fR=\fP[test]
+Test the speed of the built\-in checksumming functions. If no argument is given,
+all of them are tested. Alternatively, a comma separated list can be passed, in which
 case the given ones are tested.
 .TP
 .BI \-\-cmdhelp \fR=\fPcommand
-Print help information for \fIcommand\fR.  May be `all' for all commands.
+Print help information for \fIcommand\fR. May be `all' for all commands.
 .TP
-.BI \-\-enghelp \fR=\fPioengine[,command]
-List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR defined by \fIioengine\fR.
+.BI \-\-enghelp \fR=\fP[ioengine[,command]]
+List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR
+defined by \fIioengine\fR. If no \fIioengine\fR is given, list all
+available ioengines.
 .TP
 .BI \-\-showcmd \fR=\fPjobfile
-Convert \fIjobfile\fR to a set of command-line options.
+Convert \fIjobfile\fR to a set of command\-line options.
+.TP
+.BI \-\-readonly
+Turn on safety read\-only checks, preventing writes. The \fB\-\-readonly\fR
+option is an extra safety guard to prevent users from accidentally starting
+a write workload when that is not desired. Fio will only write if
+`rw=write/randwrite/rw/randrw' is given. This extra safety net can be used
+as an extra precaution as \fB\-\-readonly\fR will also enable a write check in
+the I/O engine core to prevent writes due to unknown user space bug(s).
 .TP
 .BI \-\-eta \fR=\fPwhen
-Specifies when real-time ETA estimate should be printed.  \fIwhen\fR may
-be one of `always', `never' or `auto'.
+Specifies when real\-time ETA estimate should be printed. \fIwhen\fR may
+be `always', `never' or `auto'.
 .TP
 .BI \-\-eta\-newline \fR=\fPtime
-Force an ETA newline for every `time` period passed.
+Force a new line for every \fItime\fR period passed. When the unit is omitted,
+the value is interpreted in seconds.
 .TP
 .BI \-\-status\-interval \fR=\fPtime
-Report full output status every `time` period passed.
-.TP
-.BI \-\-readonly
-Turn on safety read-only checks, preventing any attempted write.
-.TP
-.BI \-\-section \fR=\fPsec
-Only run section \fIsec\fR from job file. This option can be used multiple times to add more sections to run.
+Force a full status dump of cumulative (from job start) values at \fItime\fR
+intervals. This option does *not* provide per-period measurements. So
+values such as bandwidth are running averages. When the time unit is omitted,
+\fItime\fR is interpreted in seconds.
+.TP
+.BI \-\-section \fR=\fPname
+Only run specified section \fIname\fR in job file. Multiple sections can be specified.
+The \fB\-\-section\fR option allows one to combine related jobs into one file.
+E.g. one job file could define light, moderate, and heavy sections. Tell
+fio to run only the "heavy" section by giving `\-\-section=heavy'
+command line option. One can also specify the "write" operations in one
+section and "verify" operation in another section. The \fB\-\-section\fR option
+only applies to job sections. The reserved *global* section is always
+parsed and used.
 .TP
 .BI \-\-alloc\-size \fR=\fPkb
-Set the internal smalloc pool size to \fIkb\fP kilobytes.
+Set the internal smalloc pool size to \fIkb\fR in KiB. The
+\fB\-\-alloc\-size\fR switch allows one to use a larger pool size for smalloc.
+If running large jobs with randommap enabled, fio can run out of memory.
+Smalloc is an internal allocator for shared structures from a fixed size
+memory pool and can grow to 16 pools. The pool size defaults to 16MiB.
+NOTE: While running `.fio_smalloc.*' backing store files are visible
+in `/tmp'.
 .TP
 .BI \-\-warnings\-fatal
 All fio parser warnings are fatal, causing fio to exit with an error.
 .TP
 .BI \-\-max\-jobs \fR=\fPnr
-Set the maximum allowed number of jobs (threads/processes) to support.
+Set the maximum number of threads/processes to support to \fInr\fR.
 .TP
 .BI \-\-server \fR=\fPargs
-Start a backend server, with \fIargs\fP specifying what to listen to. See client/server section.
+Start a backend server, with \fIargs\fR specifying what to listen to.
+See \fBCLIENT/SERVER\fR section.
 .TP
 .BI \-\-daemonize \fR=\fPpidfile
-Background a fio server, writing the pid to the given pid file.
+Background a fio server, writing the pid to the given \fIpidfile\fR file.
+.TP
+.BI \-\-client \fR=\fPhostname
+Instead of running the jobs locally, send and run them on the given \fIhostname\fR
+or set of \fIhostname\fRs. See \fBCLIENT/SERVER\fR section.
 .TP
-.BI \-\-client \fR=\fPhost
-Instead of running the jobs locally, send and run them on the given host or set of hosts.  See client/server section.
+.BI \-\-remote\-config \fR=\fPfile
+Tell fio server to load this local \fIfile\fR.
 .TP
 .BI \-\-idle\-prof \fR=\fPoption
-Report cpu idleness on a system or percpu basis (\fIoption\fP=system,percpu) or run unit work calibration only (\fIoption\fP=calibrate).
-.SH "JOB FILE FORMAT"
-Job files are in `ini' format. They consist of one or more
-job definitions, which begin with a job name in square brackets and
-extend to the next job name.  The job name can be any ASCII string
-except `global', which has a special meaning.  Following the job name is
-a sequence of zero or more parameters, one per line, that define the
-behavior of the job.  Any line starting with a `;' or `#' character is
-considered a comment and ignored.
-.P
-If \fIjobfile\fR is specified as `-', the job file will be read from
-standard input.
-.SS "Global Section"
-The global section contains default parameters for jobs specified in the
-job file.  A job is only affected by global sections residing above it,
-and there may be any number of global sections.  Specific job definitions
-may override any parameter set in global sections.
-.SH "JOB PARAMETERS"
-.SS Types
-Some parameters may take arguments of a specific type.
-Anywhere a numeric value is required, an arithmetic expression may be used,
-provided it is surrounded by parentheses. Supported operators are:
+Report CPU idleness. \fIoption\fR is one of the following:
 .RS
 .RS
 .TP
-.B addition (+)
+.B calibrate
+Run unit work calibration only and exit.
 .TP
-.B subtraction (-)
+.B system
+Show aggregate system idleness and unit work.
 .TP
-.B multiplication (*)
+.B percpu
+As \fBsystem\fR but also show per CPU idleness.
+.RE
+.RE
 .TP
-.B division (/)
+.BI \-\-inflate\-log \fR=\fPlog
+Inflate and output compressed \fIlog\fR.
 .TP
-.B modulus (%)
+.BI \-\-trigger\-file \fR=\fPfile
+Execute trigger command when \fIfile\fR exists.
+.TP
+.BI \-\-trigger\-timeout \fR=\fPtime
+Execute trigger at this \fItime\fR.
+.TP
+.BI \-\-trigger \fR=\fPcommand
+Set this \fIcommand\fR as local trigger.
 .TP
+.BI \-\-trigger\-remote \fR=\fPcommand
+Set this \fIcommand\fR as remote trigger.
+.TP
+.BI \-\-aux\-path \fR=\fPpath
+Use this \fIpath\fR for fio state generated files.
+.SH "JOB FILE FORMAT"
+Any parameters following the options will be assumed to be job files, unless
+they match a job file parameter. Multiple job files can be listed and each job
+file will be regarded as a separate group. Fio will \fBstonewall\fR execution
+between each group.
+
+Fio accepts one or more job files describing what it is
+supposed to do. The job file format is the classic ini file, where the names
+enclosed in [] brackets define the job name. You are free to use any ASCII name
+you want, except *global* which has special meaning. Following the job name is
+a sequence of zero or more parameters, one per line, that define the behavior of
+the job. If the first character in a line is a ';' or a '#', the entire line is
+discarded as a comment.
+
+A *global* section sets defaults for the jobs described in that file. A job may
+override a *global* section parameter, and a job file may even have several
+*global* sections if so desired. A job is only affected by a *global* section
+residing above it.
+
+The \fB\-\-cmdhelp\fR option also lists all options. If used with a \fIcommand\fR
+argument, \fB\-\-cmdhelp\fR will detail the given \fIcommand\fR.
+
+See the `examples/' directory for inspiration on how to write job files. Note
+the copyright and license requirements currently apply to
+`examples/' files.
+.SH "JOB FILE PARAMETERS"
+Some parameters take an option of a given type, such as an integer or a
+string. Anywhere a numeric value is required, an arithmetic expression may be
+used, provided it is surrounded by parentheses. Supported operators are:
+.RS
+.P
+.B addition (+)
+.P
+.B subtraction (\-)
+.P
+.B multiplication (*)
+.P
+.B division (/)
+.P
+.B modulus (%)
+.P
 .B exponentiation (^)
 .RE
-.RE
 .P
 For time values in expressions, units are microseconds by default. This is
 different than for time values not in expressions (not enclosed in
-parentheses). The types used are:
+parentheses).
+.SH "PARAMETER TYPES"
+The following parameter types are used.
 .TP
 .I str
-String: a sequence of alphanumeric characters.
+String. A sequence of alphanumeric characters.
+.TP
+.I time
+Integer with possible time suffix. Without a unit, the value is interpreted as
+seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h' for
+hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and 'us'
+(or 'usec') for microseconds. For example, use 10m for 10 minutes.
 .TP
 .I int
-SI integer: a whole number, possibly containing a suffix denoting the base unit
-of the value.  Accepted suffixes are `k', 'M', 'G', 'T', and 'P', denoting
-kilo (1024), mega (1024^2), giga (1024^3), tera (1024^4), and peta (1024^5)
-respectively. If prefixed with '0x', the value is assumed to be base 16
-(hexadecimal). A suffix may include a trailing 'b', for instance 'kb' is
-identical to 'k'. You can specify a base 10 value by using 'KiB', 'MiB','GiB',
-etc. This is useful for disk drives where values are often given in base 10
-values. Specifying '30GiB' will get you 30*1000^3 bytes.
-When specifying times the default suffix meaning changes, still denoting the
-base unit of the value, but accepted suffixes are 'D' (days), 'H' (hours), 'M'
-(minutes), 'S' Seconds, 'ms' (or msec) milli seconds, 'us' (or 'usec') micro
-seconds. Time values without a unit specify seconds.
-The suffixes are not case sensitive.
+Integer. A whole number value, which may contain an integer prefix
+and an integer suffix.
+.RS
+.RS
+.P
+[*integer prefix*] **number** [*integer suffix*]
+.RE
+.P
+The optional *integer prefix* specifies the number's base. The default
+is decimal. *0x* specifies hexadecimal.
+.P
+The optional *integer suffix* specifies the number's units, and includes an
+optional unit prefix and an optional unit. For quantities of data, the
+default unit is bytes. For quantities of time, the default unit is seconds
+unless otherwise specified.
+.P
+With `kb_base=1000', fio follows international standards for unit
+prefixes. To specify power\-of\-10 decimal values defined in the
+International System of Units (SI):
+.RS
+.P
+.PD 0
+K means kilo (K) or 1000
+.P
+M means mega (M) or 1000**2
+.P
+G means giga (G) or 1000**3
+.P
+T means tera (T) or 1000**4
+.P
+P means peta (P) or 1000**5
+.PD
+.RE
+.P
+To specify power\-of\-2 binary values defined in IEC 80000\-13:
+.RS
+.P
+.PD 0
+Ki means kibi (Ki) or 1024
+.P
+Mi means mebi (Mi) or 1024**2
+.P
+Gi means gibi (Gi) or 1024**3
+.P
+Ti means tebi (Ti) or 1024**4
+.P
+Pi means pebi (Pi) or 1024**5
+.PD
+.RE
+.P
+With `kb_base=1024' (the default), the unit prefixes are opposite
+from those specified in the SI and IEC 80000\-13 standards to provide
+compatibility with old scripts. For example, 4k means 4096.
+.P
+For quantities of data, an optional unit of 'B' may be included
+(e.g., 'kB' is the same as 'k').
+.P
+The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
+not milli). 'b' and 'B' both mean byte, not bit.
+.P
+Examples with `kb_base=1000':
+.RS
+.P
+.PD 0
+4 KiB: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
+.P
+1 MiB: 1048576, 1mi, 1024ki
+.P
+1 MB: 1000000, 1m, 1000k
+.P
+1 TiB: 1099511627776, 1ti, 1024gi, 1048576mi
+.P
+1 TB: 1000000000000, 1t, 1000g, 1000000m
+.PD
+.RE
+.P
+Examples with `kb_base=1024' (default):
+.RS
+.P
+.PD 0
+4 KiB: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+.P
+1 MiB: 1048576, 1m, 1024k
+.P
+1 MB: 1000000, 1mi, 1000ki
+.P
+1 TiB: 1099511627776, 1t, 1024g, 1048576m
+.P
+1 TB: 1000000000000, 1ti, 1000gi, 1000000mi
+.PD
+.RE
+.P
+To specify times (units are not case sensitive):
+.RS
+.P
+.PD 0
+D means days
+.P
+H means hours
+.P
+M means minutes
+.P
+s or sec means seconds (default)
+.P
+ms or msec means milliseconds
+.P
+us or usec means microseconds
+.PD
+.RE
+.P
+If the option accepts an upper and lower range, use a colon ':' or
+minus '\-' to separate such values. See \fIirange\fR parameter type.
+If the lower value specified happens to be larger than the upper value
+the two values are swapped.
+.RE
 .TP
 .I bool
-Boolean: a true or false value. `0' denotes false, `1' denotes true.
+Boolean. Usually parsed as an integer, however only defined for
+true and false (1 and 0).
 .TP
 .I irange
-Integer range: a range of integers specified in the format
-\fIlower\fR:\fIupper\fR or \fIlower\fR\-\fIupper\fR. \fIlower\fR and
-\fIupper\fR may contain a suffix as described above.  If an option allows two
-sets of ranges, they are separated with a `,' or `/' character. For example:
-`8\-8k/8M\-4G'.
+Integer range with suffix. Allows value range to be given, such as
+1024\-4096. A colon may also be used as the separator, e.g. 1k:4k. If the
+option allows two sets of ranges, they can be specified with a ',' or '/'
+delimiter: 1k\-4k/8k\-32k. Also see \fIint\fR parameter type.
 .TP
 .I float_list
-List of floating numbers: A list of floating numbers, separated by
-a ':' character.
-.SS "Parameter List"
+A list of floating point numbers, separated by a ':' character.
+.SH "JOB PARAMETERS"
+With the above in mind, here follows the complete list of fio job parameters.
+.SS "Units"
 .TP
-.BI name \fR=\fPstr
-May be used to override the job name.  On the command line, this parameter
-has the special purpose of signalling the start of a new job.
+.BI kb_base \fR=\fPint
+Select the interpretation of unit prefixes in input parameters.
+.RS
+.RS
 .TP
-.BI wait_for \fR=\fPstr
-Specifies the name of the already defined job to wait for. Single waitee name
-only may be specified. If set, the job won't be started until all workers of
-the waitee job are done.  Wait_for operates on the job name basis, so there are
-a few limitations. First, the waitee must be defined prior to the waiter job
-(meaning no forward references). Second, if a job is being referenced as a
-waitee, it must have a unique name (no duplicate waitees).
+.B 1000
+Inputs comply with IEC 80000\-13 and the International
+System of Units (SI). Use:
+.RS
+.P
+.PD 0
+\- power\-of\-2 values with IEC prefixes (e.g., KiB)
+.P
+\- power\-of\-10 values with SI prefixes (e.g., kB)
+.PD
+.RE
+.TP
+.B 1024
+Compatibility mode (default). To avoid breaking old scripts:
+.P
+.RS
+.PD 0
+\- power\-of\-2 values with SI prefixes
+.P
+\- power\-of\-10 values with IEC prefixes
+.PD
+.RE
+.RE
+.P
+See \fBbs\fR for more details on input parameters.
+.P
+Outputs always use correct prefixes. Most outputs include both
+side\-by\-side, like:
+.P
+.RS
+bw=2383.3kB/s (2327.4KiB/s)
+.RE
+.P
+If only one value is reported, then kb_base selects the one to use:
+.P
+.RS
+.PD 0
+1000 \-\- SI prefixes
+.P
+1024 \-\- IEC prefixes
+.PD
+.RE
+.RE
+.TP
+.BI unit_base \fR=\fPint
+Base unit for reporting. Allowed values are:
+.RS
+.RS
+.TP
+.B 0
+Use auto\-detection (default).
+.TP
+.B 8
+Byte based.
+.TP
+.B 1
+Bit based.
+.RE
+.RE
+.SS "Job description"
+.TP
+.BI name \fR=\fPstr
+ASCII name of the job. This may be used to override the name printed by fio
+for this job. Otherwise the job name is used. On the command line this
+parameter has the special purpose of also signaling the start of a new job.
 .TP
 .BI description \fR=\fPstr
-Human-readable description of the job. It is printed when the job is run, but
-otherwise has no special purpose.
+Text description of the job. Doesn't do anything except dump this text
+description when this job is run. It's not parsed.
+.TP
+.BI loops \fR=\fPint
+Run the specified number of iterations of this job. Used to repeat the same
+workload a given number of times. Defaults to 1.
+.TP
+.BI numjobs \fR=\fPint
+Create the specified number of clones of this job. Each clone of job
+is spawned as an independent thread or process. May be used to set up a
+larger number of threads/processes doing the same thing. Each thread is
+reported separately; to see statistics for all clones as a whole, use
+\fBgroup_reporting\fR in conjunction with \fBnew_group\fR.
+See \fB\-\-max\-jobs\fR. Default: 1.
+.SS "Time related parameters"
+.TP
+.BI runtime \fR=\fPtime
+Tell fio to terminate processing after the specified period of time. It
+can be quite hard to determine for how long a specified job will run, so
+this parameter is handy to cap the total runtime to a given time. When
+the unit is omitted, the value is interpreted in seconds.
+.TP
+.BI time_based
+If set, fio will run for the duration of the \fBruntime\fR specified
+even if the file(s) are completely read or written. It will simply loop over
+the same workload as many times as the \fBruntime\fR allows.
+.TP
+.BI startdelay \fR=\fPirange(int)
+Delay the start of job for the specified amount of time. Can be a single
+value or a range. When given as a range, each thread will choose a value
+randomly from within the range. Value is in seconds if a unit is omitted.
+.TP
+.BI ramp_time \fR=\fPtime
+If set, fio will run the specified workload for this amount of time before
+logging any performance numbers. Useful for letting performance settle
+before logging results, thus minimizing the runtime required for stable
+results. Note that the \fBramp_time\fR is considered lead in time for a job,
+thus it will increase the total runtime if a special timeout or
+\fBruntime\fR is specified. When the unit is omitted, the value is
+given in seconds.
+.TP
+.BI clocksource \fR=\fPstr
+Use the given clocksource as the base of timing. The supported options are:
+.RS
+.RS
+.TP
+.B gettimeofday
+\fBgettimeofday\fR\|(2)
+.TP
+.B clock_gettime
+\fBclock_gettime\fR\|(2)
+.TP
+.B cpu
+Internal CPU clock source
+.RE
+.P
+\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast (and
+fio is heavy on time calls). Fio will automatically use this clocksource if
+it's supported and considered reliable on the system it is running on,
+unless another clocksource is specifically set. For x86/x86\-64 CPUs, this
+means supporting TSC Invariant.
+.RE
+.TP
+.BI gtod_reduce \fR=\fPbool
+Enable all of the \fBgettimeofday\fR\|(2) reducing options
+(\fBdisable_clat\fR, \fBdisable_slat\fR, \fBdisable_bw_measurement\fR) plus
+reduce precision of the timeout somewhat to really shrink the
+\fBgettimeofday\fR\|(2) call count. With this option enabled, we only do
+about 0.4% of the \fBgettimeofday\fR\|(2) calls we would have done if all
+time keeping was enabled.
+.TP
+.BI gtod_cpu \fR=\fPint
+Sometimes it's cheaper to dedicate a single thread of execution to just
+getting the current time. Fio (and databases, for instance) are very
+intensive on \fBgettimeofday\fR\|(2) calls. With this option, you can set
+one CPU aside for doing nothing but logging current time to a shared memory
+location. Then the other threads/processes that run I/O workloads need only
+copy that segment, instead of entering the kernel with a
+\fBgettimeofday\fR\|(2) call. The CPU set aside for doing these time
+calls will be excluded from other uses. Fio will manually clear it from the
+CPU mask of other jobs.
+.SS "Target file/device"
 .TP
 .BI directory \fR=\fPstr
-Prefix filenames with this directory.  Used to place files in a location other
-than `./'.
-You can specify a number of directories by separating the names with a ':'
-character. These directories will be assigned equally distributed to job clones
-creates with \fInumjobs\fR as long as they are using generated filenames.
-If specific \fIfilename(s)\fR are set fio will use the first listed directory,
-and thereby matching the  \fIfilename\fR semantic which generates a file each
-clone if not specified, but let all clones use the same if set. See
-\fIfilename\fR for considerations regarding escaping certain characters on
-some platforms.
+Prefix \fBfilename\fRs with this directory. Used to place files in a different
+location than `./'. You can specify a number of directories by
+separating the names with a ':' character. These directories will be
+assigned equally distributed to job clones created by \fBnumjobs\fR as
+long as they are using generated filenames. If specific \fBfilename\fR(s) are
+set fio will use the first listed directory, thereby matching the
+\fBfilename\fR semantic, which generates a file for each clone if not
+specified but lets all clones use the same file if set.
+.RS
+.P
+See the \fBfilename\fR option for information on how to escape ':' and '\\'
+characters within the directory path itself.
+.RE
 .TP
 .BI filename \fR=\fPstr
-.B fio
-normally makes up a file name based on the job name, thread number, and file
-number. If you want to share files between threads in a job or several jobs,
-specify a \fIfilename\fR for each of them to override the default.
-If the I/O engine is file-based, you can specify
-a number of files by separating the names with a `:' character. `\-' is a
-reserved name, meaning stdin or stdout, depending on the read/write direction
-set. On Windows, disk devices are accessed as \\.\PhysicalDrive0 for the first
-device, \\.\PhysicalDrive1 for the second etc. Note: Windows and FreeBSD
-prevent write access to areas of the disk containing in-use data
-(e.g. filesystems). If the wanted filename does need to include a colon, then
-escape that with a '\\' character. For instance, if the filename is
-"/dev/dsk/foo@3,0:c", then you would use filename="/dev/dsk/foo@3,0\\:c".
+Fio normally makes up a \fBfilename\fR based on the job name, thread number, and
+file number (see \fBfilename_format\fR). If you want to share files
+between threads in a job or several
+jobs with fixed file paths, specify a \fBfilename\fR for each of them to override
+the default. If the ioengine is file based, you can specify a number of files
+by separating the names with a ':' colon. So if you wanted a job to open
+`/dev/sda' and `/dev/sdb' as the two working files, you would use
+`filename=/dev/sda:/dev/sdb'. This also means that whenever this option is
+specified, \fBnrfiles\fR is ignored. The size of regular files specified
+by this option will be \fBsize\fR divided by number of files unless an
+explicit size is specified by \fBfilesize\fR.
+.RS
+.P
+Each colon and backslash in the wanted path must be escaped with a '\\'
+character. For instance, if the path is `/dev/dsk/foo@3,0:c' then you
+would use `filename=/dev/dsk/foo@3,0\\:c' and if the path is
+`F:\\\\filename' then you would use `filename=F\\:\\\\filename'.
+.P
+On Windows, disk devices are accessed as `\\\\\\\\.\\\\PhysicalDrive0' for
+the first device, `\\\\\\\\.\\\\PhysicalDrive1' for the second etc.
+Note: Windows and FreeBSD prevent write access to areas
+of the disk containing in\-use data (e.g. filesystems).
+.P
+The filename `\-' is a reserved name, meaning stdin or stdout. Which
+of the two depends on the read/write direction set.
+.RE
 .TP
 .BI filename_format \fR=\fPstr
-If sharing multiple files between jobs, it is usually necessary to have
-fio generate the exact names that you want. By default, fio will name a file
+If sharing multiple files between jobs, it is usually necessary to have fio
+generate the exact names that you want. By default, fio will name a file
 based on the default file format specification of
-\fBjobname.jobnumber.filenumber\fP. With this option, that can be
+`jobname.jobnumber.filenumber'. With this option, that can be
 customized. Fio will recognize and replace the following keywords in this
 string:
 .RS
@@ -239,44 +571,168 @@
 The incremental number of the file for that worker thread or process.
 .RE
 .P
-To have dependent jobs share a set of files, this option can be set to
-have fio generate filenames that are shared between the two. For instance,
-if \fBtestfiles.$filenum\fR is specified, file number 4 for any job will
-be named \fBtestfiles.4\fR. The default of \fB$jobname.$jobnum.$filenum\fR
+To have dependent jobs share a set of files, this option can be set to have
+fio generate filenames that are shared between the two. For instance, if
+`testfiles.$filenum' is specified, file number 4 for any job will be
+named `testfiles.4'. The default of `$jobname.$jobnum.$filenum'
 will be used if no other format specifier is given.
 .RE
-.P
 .TP
 .BI unique_filename \fR=\fPbool
-To avoid collisions between networked clients, fio defaults to prefixing
-any generated filenames (with a directory specified) with the source of
-the client connecting. To disable this behavior, set this option to 0.
+To avoid collisions between networked clients, fio defaults to prefixing any
+generated filenames (with a directory specified) with the source of the
+client connecting. To disable this behavior, set this option to 0.
+.TP
+.BI opendir \fR=\fPstr
+Recursively open any files below directory \fIstr\fR.
 .TP
 .BI lockfile \fR=\fPstr
-Fio defaults to not locking any files before it does IO to them. If a file or
-file descriptor is shared, fio can serialize IO to that file to make the end
-result consistent. This is usual for emulating real workloads that share files.
-The lock modes are:
+Fio defaults to not locking any files before it does I/O to them. If a file
+or file descriptor is shared, fio can serialize I/O to that file to make the
+end result consistent. This is useful for emulating real workloads that share
+files. The lock modes are:
 .RS
 .RS
 .TP
 .B none
-No locking. This is the default.
+No locking. The default.
 .TP
 .B exclusive
-Only one thread or process may do IO at a time, excluding all others.
+Only one thread or process may do I/O at a time, excluding all others.
 .TP
 .B readwrite
-Read-write locking on the file. Many readers may access the file at the same
-time, but writes get exclusive access.
+Read\-write locking on the file. Many readers may
+access the file at the same time, but writes get exclusive access.
 .RE
 .RE
+.TP
+.BI nrfiles \fR=\fPint
+Number of files to use for this job. Defaults to 1. The size of files
+will be \fBsize\fR divided by this unless explicit size is specified by
+\fBfilesize\fR. Files are created for each thread separately, and each
+file will have a file number within its name by default, as explained in
+the \fBfilename\fR section.
+.TP
+.BI openfiles \fR=\fPint
+Number of files to keep open at the same time. Defaults to the same as
+\fBnrfiles\fR, can be set smaller to limit the number of simultaneous
+opens.
+.TP
+.BI file_service_type \fR=\fPstr
+Defines how fio decides which file from a job to service next. The following
+types are defined:
+.RS
+.RS
+.TP
+.B random
+Choose a file at random.
+.TP
+.B roundrobin
+Round robin over opened files. This is the default.
+.TP
+.B sequential
+Finish one file before moving on to the next. Multiple files can
+still be open depending on \fBopenfiles\fR.
+.TP
+.B zipf
+Use a Zipf distribution to decide what file to access.
+.TP
+.B pareto
+Use a Pareto distribution to decide what file to access.
+.TP
+.B normal
+Use a Gaussian (normal) distribution to decide what file to access.
+.TP
+.B gauss
+Alias for normal.
+.RE
 .P
-.BI opendir \fR=\fPstr
-Recursively open any files below directory \fIstr\fR.
+For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be appended to
+tell fio how many I/Os to issue before switching to a new file. For example,
+specifying `file_service_type=random:8' would cause fio to issue
+8 I/Os before selecting a new file at random. For the non\-uniform
+distributions, a floating point postfix can be given to influence how the
+distribution is skewed. See \fBrandom_distribution\fR for a description
+of how that would work.
+.RE
+.TP
+.BI ioscheduler \fR=\fPstr
+Attempt to switch the device hosting the file to the specified I/O scheduler
+before running.
+.TP
+.BI create_serialize \fR=\fPbool
+If true, serialize the file creation for the jobs. This may be handy to
+avoid interleaving of data files, which may greatly depend on the filesystem
+used and even the number of processors in the system. Default: true.
+.TP
+.BI create_fsync \fR=\fPbool
+\fBfsync\fR\|(2) the data file after creation. This is the default.
+.TP
+.BI create_on_open \fR=\fPbool
+If true, don't pre\-create files but allow the job's open() to create a file
+when it's time to do I/O. Default: false \-\- pre\-create all necessary files
+when the job starts.
+.TP
+.BI create_only \fR=\fPbool
+If true, fio will only run the setup phase of the job. If files need to be
+laid out or updated on disk, only that will be done \-\- the actual job contents
+are not executed. Default: false.
+.TP
+.BI allow_file_create \fR=\fPbool
+If true, fio is permitted to create files as part of its workload. If this
+option is false, then fio will error out if
+the files it needs to use don't already exist. Default: true.
+.TP
+.BI allow_mounted_write \fR=\fPbool
+If this isn't set, fio will abort jobs that are destructive (e.g. that write)
+to what appears to be a mounted device or partition. This should help catch
+creating inadvertently destructive tests, not realizing that the test will
+destroy data on the mounted file system. Note that some platforms don't allow
+writing against a mounted device regardless of this option. Default: false.
+.TP
+.BI pre_read \fR=\fPbool
+If this is given, files will be pre\-read into memory before starting the
+given I/O operation. This will also clear the \fBinvalidate\fR flag,
+since it is pointless to pre\-read and then drop the cache. This will only
+work for I/O engines that are seek\-able, since they allow you to read the
+same data multiple times. Thus it will not work on non\-seekable I/O engines
+(e.g. network, splice). Default: false.
+.TP
+.BI unlink \fR=\fPbool
+Unlink the job files when done. Not the default, as repeated runs of that
+job would then waste time recreating the file set again and again. Default:
+false.
+.TP
+.BI unlink_each_loop \fR=\fPbool
+Unlink job files after each iteration or loop. Default: false.
+.TP
+.BI zonesize \fR=\fPint
+Divide a file into zones of the specified size. See \fBzoneskip\fR.
+.TP
+.BI zonerange \fR=\fPint
+Give size of an I/O zone. See \fBzoneskip\fR.
+.TP
+.BI zoneskip \fR=\fPint
+Skip the specified number of bytes when \fBzonesize\fR data has been
+read. The two zone options can be used to only do I/O on zones of a file.
+.SS "I/O type"
+.TP
+.BI direct \fR=\fPbool
+If value is true, use non\-buffered I/O. This is usually O_DIRECT. Note that
+OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous
+ioengines don't support direct I/O. Default: false.
+.TP
+.BI atomic \fR=\fPbool
+If value is true, attempt to use atomic direct I/O. Atomic writes are
+guaranteed to be stable once acknowledged by the operating system. Only
+Linux supports O_ATOMIC right now.
+.TP
+.BI buffered \fR=\fPbool
+If value is true, use buffered I/O. This is the opposite of the
+\fBdirect\fR option. Defaults to true.
 .TP
 .BI readwrite \fR=\fPstr "\fR,\fP rw" \fR=\fPstr
-Type of I/O pattern.  Accepted values are:
+Type of I/O pattern. Accepted values are:
 .RS
 .RS
 .TP
@@ -287,7 +743,7 @@
 Sequential writes.
 .TP
 .B trim
-Sequential trim (Linux block devices only).
+Sequential trims (Linux block devices only).
 .TP
 .B randread
 Random reads.
@@ -296,73 +752,69 @@
 Random writes.
 .TP
 .B randtrim
-Random trim (Linux block devices only).
+Random trims (Linux block devices only).
 .TP
-.B rw, readwrite
-Mixed sequential reads and writes.
+.B rw,readwrite
+Sequential mixed reads and writes.
 .TP
 .B randrw
-Mixed random reads and writes.
+Random mixed reads and writes.
 .TP
 .B trimwrite
-Trim and write mixed workload. Blocks will be trimmed first, then the same
-blocks will be written to.
+Sequential trim+write sequences. Blocks will be trimmed first,
+then the same blocks will be written to.
 .RE
 .P
-Fio defaults to read if the option is not specified.
-For mixed I/O, the default split is 50/50. For certain types of io the result
-may still be skewed a bit, since the speed may be different. It is possible to
-specify a number of IO's to do before getting a new offset, this is done by
-appending a `:\fI<nr>\fR to the end of the string given. For a random read, it
-would look like \fBrw=randread:8\fR for passing in an offset modifier with a
-value of 8. If the postfix is used with a sequential IO pattern, then the value
-specified will be added to the generated offset for each IO. For instance,
-using \fBrw=write:4k\fR will skip 4k for every write. It turns sequential IO
-into sequential IO with holes. See the \fBrw_sequencer\fR option.
+Fio defaults to read if the option is not specified. For the mixed I/O
+types, the default is to split them 50/50. For certain types of I/O the
+result may still be skewed a bit, since the speed may be different.
+.P
+It is possible to specify the number of I/Os to do before getting a new
+offset by appending `:<nr>' to the end of the string given. For a
+random read, it would look like `rw=randread:8' for passing in an offset
+modifier with a value of 8. If the suffix is used with a sequential I/O
+pattern, then the `<nr>' value specified will be added to the generated
+offset for each I/O, turning sequential I/O into sequential I/O with holes.
+For instance, using `rw=write:4k' will skip 4k for every write. Also see
+the \fBrw_sequencer\fR option.
 .RE
 .TP
 .BI rw_sequencer \fR=\fPstr
-If an offset modifier is given by appending a number to the \fBrw=<str>\fR line,
-then this option controls how that number modifies the IO offset being
-generated. Accepted values are:
+If an offset modifier is given by appending a number to the `rw=\fIstr\fR'
+line, then this option controls how that number modifies the I/O offset
+being generated. Accepted values are:
 .RS
 .RS
 .TP
 .B sequential
-Generate sequential offset
+Generate sequential offset.
 .TP
 .B identical
-Generate the same offset
+Generate the same offset.
 .RE
 .P
-\fBsequential\fR is only useful for random IO, where fio would normally
-generate a new random offset for every IO. If you append eg 8 to randread, you
-would get a new random offset for every 8 IO's. The result would be a seek for
-only every 8 IO's, instead of for every IO. Use \fBrw=randread:8\fR to specify
-that. As sequential IO is already sequential, setting \fBsequential\fR for that
-would not result in any differences.  \fBidentical\fR behaves in a similar
-fashion, except it sends the same offset 8 number of times before generating a
-new offset.
+\fBsequential\fR is only useful for random I/O, where fio would normally
+generate a new random offset for every I/O. If you append e.g. 8 to randread,
+you would get a new random offset for every 8 I/Os. The result would be a
+seek for only every 8 I/Os, instead of for every I/O. Use `rw=randread:8'
+to specify that. As sequential I/O is already sequential, setting
+\fBsequential\fR for that would not result in any differences. \fBidentical\fR
+behaves in a similar fashion, except it sends the same offset 8 times
+before generating a new offset.
 .RE
-.P
-.TP
-.BI kb_base \fR=\fPint
-The base unit for a kilobyte. The defacto base is 2^10, 1024.  Storage
-manufacturers like to use 10^3 or 1000 as a base ten unit instead, for obvious
-reasons. Allowed values are 1024 or 1000, with 1024 being the default.
 .TP
 .BI unified_rw_reporting \fR=\fPbool
 Fio normally reports statistics on a per data direction basis, meaning that
-read, write, and trim are accounted and reported separately. If this option is
-set fio sums the results and reports them as "mixed" instead.
+reads, writes, and trims are accounted and reported separately. If this
+option is set, fio sums the results and reports them as "mixed" instead.
 .TP
 .BI randrepeat \fR=\fPbool
-Seed the random number generator used for random I/O patterns in a predictable
-way so the pattern is repeatable across runs.  Default: true.
+Seed the random number generator used for random I/O patterns in a
+predictable way so the pattern is repeatable across runs. Default: true.
 .TP
 .BI allrandrepeat \fR=\fPbool
 Seed all random number generators in a predictable way so results are
-repeatable across runs.  Default: false.
+repeatable across runs. Default: false.
 .TP
 .BI randseed \fR=\fPint
 Seed the random number generators based on this seed value, to be able to
@@ -370,30 +822,36 @@
 sequence depends on the \fBrandrepeat\fR setting.
 .TP
 .BI fallocate \fR=\fPstr
-Whether pre-allocation is performed when laying down files. Accepted values
-are:
+Whether pre\-allocation is performed when laying down files.
+Accepted values are:
 .RS
 .RS
 .TP
 .B none
-Do not pre-allocate space.
+Do not pre\-allocate space.
+.TP
+.B native
+Use a platform's native pre\-allocation call but fall back to
+\fBnone\fR behavior if it fails/is not implemented.
 .TP
 .B posix
-Pre-allocate via \fBposix_fallocate\fR\|(3).
+Pre\-allocate via \fBposix_fallocate\fR\|(3).
 .TP
 .B keep
-Pre-allocate via \fBfallocate\fR\|(2) with FALLOC_FL_KEEP_SIZE set.
+Pre\-allocate via \fBfallocate\fR\|(2) with
+FALLOC_FL_KEEP_SIZE set.
 .TP
 .B 0
-Backward-compatible alias for 'none'.
+Backward\-compatible alias for \fBnone\fR.
 .TP
 .B 1
-Backward-compatible alias for 'posix'.
+Backward\-compatible alias for \fBposix\fR.
 .RE
 .P
-May not be available on all supported platforms. 'keep' is only
-available on Linux. If using ZFS on Solaris this must be set to 'none'
-because ZFS doesn't support it. Default: 'posix'.
+May not be available on all supported platforms. \fBkeep\fR is only available
+on Linux. If using ZFS on Solaris this cannot be set to \fBposix\fR
+because ZFS doesn't support pre\-allocation. Default: \fBnative\fR if any
+pre\-allocation methods are available, \fBnone\fR if not.
 .RE
 .TP
 .BI fadvise_hint \fR=\fPstr
@@ -407,225 +865,569 @@
 .TP
 .B 1
 Backwards compatible hint for "advise with fio workload type". This
-uses \fBFADV_RANDOM\fR for a random workload, and \fBFADV_SEQUENTIAL\fR
+uses FADV_RANDOM for a random workload, and FADV_SEQUENTIAL
 for a sequential workload.
 .TP
 .B sequential
-Advise using \fBFADV_SEQUENTIAL\fR
+Advise using FADV_SEQUENTIAL.
 .TP
 .B random
-Advise using \fBFADV_RANDOM\fR
+Advise using FADV_RANDOM.
 .RE
 .RE
 .TP
-.BI fadvise_stream \fR=\fPint
-Use \fBposix_fadvise\fR\|(2) to advise the kernel what stream ID the
-writes issued belong to. Only supported on Linux. Note, this option
-may change going forward.
+.BI write_hint \fR=\fPstr
+Use \fBfcntl\fR\|(2) to advise the kernel what life time to expect
+from a write. Only supported on Linux, as of version 4.13. Accepted
+values are:
+.RS
+.RS
 .TP
-.BI size \fR=\fPint
-Total size of I/O for this job.  \fBfio\fR will run until this many bytes have
-been transferred, unless limited by other options (\fBruntime\fR, for instance,
-or increased/descreased by \fBio_size\fR). Unless \fBnrfiles\fR and
-\fBfilesize\fR options are given, this amount will be divided between the
-available files for the job. If not set, fio will use the full size of the
-given files or devices. If the files do not exist, size must be given. It is
-also possible to give size as a percentage between 1 and 100. If size=20% is
-given, fio will use 20% of the full size of the given files or devices.
+.B none
+No particular life time associated with this file.
 .TP
-.BI io_size \fR=\fPint "\fR,\fB io_limit \fR=\fPint
-Normally fio operates within the region set by \fBsize\fR, which means that
-the \fBsize\fR option sets both the region and size of IO to be performed.
-Sometimes that is not what you want. With this option, it is possible to
-define just the amount of IO that fio should do. For instance, if \fBsize\fR
-is set to 20G and \fBio_limit\fR is set to 5G, fio will perform IO within
-the first 20G but exit when 5G have been done. The opposite is also
-possible - if \fBsize\fR is set to 20G, and \fBio_size\fR is set to 40G, then
-fio will do 40G of IO within the 0..20G region.
+.B short
+Data written to this file has a short life time.
 .TP
-.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool
-Sets size to something really large and waits for ENOSPC (no space left on
-device) as the terminating condition. Only makes sense with sequential write.
-For a read workload, the mount point will be filled first then IO started on
-the result. This option doesn't make sense if operating on a raw device node,
-since the size of that is already known by the file system. Additionally,
-writing beyond end-of-device will not return ENOSPC there.
-.TP
-.BI filesize \fR=\fPirange
-Individual file sizes. May be a range, in which case \fBfio\fR will select sizes
-for files at random within the given range, limited to \fBsize\fR in total (if
-that is given). If \fBfilesize\fR is not specified, each created file is the
-same size.
+.B medium
+Data written to this file has a medium life time.
 .TP
-.BI file_append \fR=\fPbool
-Perform IO after the end of the file. Normally fio will operate within the
-size of a file. If this option is set, then fio will append to the file
-instead. This has identical behavior to setting \fRoffset\fP to the size
-of a file. This option is ignored on non-regular files.
+.B long
+Data written to this file has a long life time.
 .TP
-.BI blocksize \fR=\fPint[,int] "\fR,\fB bs" \fR=\fPint[,int]
-Block size for I/O units.  Default: 4k.  Values for reads, writes, and trims
-can be specified separately in the format \fIread\fR,\fIwrite\fR,\fItrim\fR
-either of which may be empty to leave that value at its default. If a trailing
-comma isn't given, the remainder will inherit the last value set.
-.TP
-.BI blocksize_range \fR=\fPirange[,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange]
-Specify a range of I/O block sizes.  The issued I/O unit will always be a
-multiple of the minimum size, unless \fBblocksize_unaligned\fR is set.  Applies
-to both reads and writes if only one range is given, but can be specified
-separately with a comma separating the values. Example: bsrange=1k-4k,2k-8k.
-Also (see \fBblocksize\fR).
-.TP
-.BI bssplit \fR=\fPstr
-This option allows even finer grained control of the block sizes issued,
-not just even splits between them. With this option, you can weight various
-block sizes for exact control of the issued IO for a job that has mixed
-block sizes. The format of the option is bssplit=blocksize/percentage,
-optionally adding as many definitions as needed separated by a colon.
-Example: bssplit=4k/10:64k/50:32k/40 would issue 50% 64k blocks, 10% 4k
-blocks and 40% 32k blocks. \fBbssplit\fR also supports giving separate
-splits to reads and writes. The format is identical to what the
-\fBbs\fR option accepts, the read and write parts are separated with a
-comma.
-.TP
-.B blocksize_unaligned\fR,\fP bs_unaligned
-If set, any size in \fBblocksize_range\fR may be used.  This typically won't
-work with direct I/O, as that normally requires sector alignment.
-.TP
-.BI blockalign \fR=\fPint[,int] "\fR,\fB ba" \fR=\fPint[,int]
-At what boundary to align random IO offsets. Defaults to the same as 'blocksize'
-the minimum blocksize given.  Minimum alignment is typically 512b
-for using direct IO, though it usually depends on the hardware block size.
-This option is mutually exclusive with using a random map for files, so it
-will turn off that option.
+.B extreme
+Data written to this file has a very long life time.
+.RE
+.P
+The values are all relative to each other, and no absolute meaning
+should be associated with them.
+.RE
+.TP
+.BI offset \fR=\fPint
+Start I/O at the provided offset in the file, given as either a fixed size in
+bytes or a percentage. If a percentage is given, the next \fBblockalign\fR\-ed
+offset will be used. Data before the given offset will not be touched. This
+effectively caps the file size at `real_size \- offset'. Can be combined with
+\fBsize\fR to constrain the start and end range of the I/O workload.
+A percentage can be specified by a number between 1 and 100 followed by '%',
+for example, `offset=20%' to specify 20%.
+.TP
+.BI offset_increment \fR=\fPint
+If this is provided, then the real offset becomes `\fBoffset\fR + \fBoffset_increment\fR
+* thread_number', where the thread number is a counter that starts at 0 and
+is incremented for each sub\-job (i.e. when \fBnumjobs\fR option is
+specified). This option is useful if there are several jobs which are
+intended to operate on a file in parallel disjoint segments, with even
+spacing between the starting points.
+.TP
+.BI number_ios \fR=\fPint
+Fio will normally perform I/Os until it has exhausted the size of the region
+set by \fBsize\fR, or if it exhausts the allocated time (or hits an error
+condition). With this setting, the range/size can be set independently of
+the number of I/Os to perform. When fio reaches this number, it will exit
+normally and report status. Note that this does not extend the amount of I/O
+that will be done, it will only stop fio if this condition is met before
+other end\-of\-job criteria.
+.TP
+.BI fsync \fR=\fPint
+If writing to a file, issue an \fBfsync\fR\|(2) (or its equivalent) of
+the dirty data for every number of blocks given. For example, if you give 32
+as a parameter, fio will sync the file after every 32 writes issued. If fio is
+using non\-buffered I/O, we may not sync the file. The exception is the sg
+I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which
+means fio does not periodically issue and wait for a sync to complete. Also
+see \fBend_fsync\fR and \fBfsync_on_close\fR.
+.TP
+.BI fdatasync \fR=\fPint
+Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and
+not metadata blocks. In Windows, FreeBSD, and DragonFlyBSD there is no
+\fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2).
+Defaults to 0, which means fio does not periodically issue and wait for a
+data\-only sync to complete.
+.TP
+.BI write_barrier \fR=\fPint
+Make every N\-th write a barrier write.
+.TP
+.BI sync_file_range \fR=\fPstr:int
+Use \fBsync_file_range\fR\|(2) for every \fIint\fR number of write
+operations. Fio will track range of writes that have happened since the last
+\fBsync_file_range\fR\|(2) call. \fIstr\fR can currently be one or more of:
+.RS
+.RS
+.TP
+.B wait_before
+SYNC_FILE_RANGE_WAIT_BEFORE
+.TP
+.B write
+SYNC_FILE_RANGE_WRITE
+.TP
+.B wait_after
+SYNC_FILE_RANGE_WAIT_AFTER
+.RE
+.P
+So if you do `sync_file_range=wait_before,write:8', fio would use
+`SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE' for every 8
+writes. Also see the \fBsync_file_range\fR\|(2) man page. This option is
+Linux specific.
+.RE
+.TP
+.BI overwrite \fR=\fPbool
+If true, writes to a file will always overwrite existing data. If the file
+doesn't already exist, it will be created before the write phase begins. If
+the file exists and is large enough for the specified write phase, nothing
+will be done. Default: false.
+.TP
+.BI end_fsync \fR=\fPbool
+If true, \fBfsync\fR\|(2) file contents when a write stage has completed.
+Default: false.
+.TP
+.BI fsync_on_close \fR=\fPbool
+If true, fio will \fBfsync\fR\|(2) a dirty file on close. This differs
+from \fBend_fsync\fR in that it will happen on every file close, not
+just at the end of the job. Default: false.
+.TP
+.BI rwmixread \fR=\fPint
+Percentage of a mixed workload that should be reads. Default: 50.
+.TP
+.BI rwmixwrite \fR=\fPint
+Percentage of a mixed workload that should be writes. If both
+\fBrwmixread\fR and \fBrwmixwrite\fR are given and the values do not
+add up to 100%, the latter of the two will be used to override the
+first. This may interfere with a given rate setting, if fio is asked to
+limit reads or writes to a certain rate. If that is the case, then the
+distribution may be skewed. Default: 50.
+.TP
+.BI random_distribution \fR=\fPstr:float[,str:float][,str:float]
+By default, fio will use a completely uniform random distribution when asked
+to perform random I/O. Sometimes it is useful to skew the distribution in
+specific ways, ensuring that some parts of the data are hotter than others.
+fio includes the following distribution models:
+.RS
+.RS
+.TP
+.B random
+Uniform random distribution
+.TP
+.B zipf
+Zipf distribution
+.TP
+.B pareto
+Pareto distribution
+.TP
+.B normal
+Normal (Gaussian) distribution
+.TP
+.B zoned
+Zoned random distribution
+.RE
+.P
+When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also
+needed to define the access pattern. For \fBzipf\fR, this is the `Zipf theta'.
+For \fBpareto\fR, it's the `Pareto power'. Fio includes a test
+program, \fBfio\-genzipf\fR, that can be used to visualize what the given input
+values will yield in terms of hit rates. If you wanted to use \fBzipf\fR with
+a `theta' of 1.2, you would use `random_distribution=zipf:1.2' as the
+option. If a non\-uniform model is used, fio will disable use of the random
+map. For the \fBnormal\fR distribution, a normal (Gaussian) deviation is
+supplied as a value between 0 and 100.
+.P
+For a \fBzoned\fR distribution, fio supports specifying percentages of I/O
+access that should fall within what range of the file or device. For
+example, given criteria of:
+.RS
+.P
+.PD 0
+60% of accesses should be to the first 10%
+.P
+30% of accesses should be to the next 20%
+.P
+8% of accesses should be to the next 30%
+.P
+2% of accesses should be to the next 40%
+.PD
+.RE
+.P
+we can define that through zoning of the random accesses. For the above
+example, the user would do:
+.RS
+.P
+random_distribution=zoned:60/10:30/20:8/30:2/40
+.RE
+.P
+similarly to how \fBbssplit\fR works for setting ranges and percentages
+of block sizes. Like \fBbssplit\fR, it's possible to specify separate
+zones for reads, writes, and trims. If just one set is given, it'll apply to
+all of them.
+.RE
+.TP
+.BI percentage_random \fR=\fPint[,int][,int]
+For a random workload, set how big a percentage should be random. This
+defaults to 100%, in which case the workload is fully random. It can be set
+anywhere from 0 to 100. Setting it to 0 would make the workload fully
+sequential. Any setting in between will result in a random mix of sequential
+and random I/O, at the given percentages. Comma\-separated values may be
+specified for reads, writes, and trims as described in \fBblocksize\fR.
+.TP
+.BI norandommap
+Normally fio will cover every block of the file when doing random I/O. If
+this option is given, fio will just get a new random offset without looking
+at past I/O history. This means that some blocks may not be read or written,
+and that some blocks may be read/written more than once. If this option is
+used with \fBverify\fR and multiple blocksizes (via \fBbsrange\fR),
+only intact blocks are verified, i.e., partially\-overwritten blocks are
+ignored.
+.TP
+.BI softrandommap \fR=\fPbool
+See \fBnorandommap\fR. If fio runs with the random block map enabled and
+it fails to allocate the map, if this option is set it will continue without
+a random block map. As coverage will not be as complete as with random maps,
+this option is disabled by default.
+.TP
+.BI random_generator \fR=\fPstr
+Fio supports the following engines for generating I/O offsets for random I/O:
+.RS
+.RS
+.TP
+.B tausworthe
+Strong 2^88 cycle random number generator.
+.TP
+.B lfsr
+Linear feedback shift register generator.
+.TP
+.B tausworthe64
+Strong 64\-bit 2^258 cycle random number generator.
+.RE
+.P
+\fBtausworthe\fR is a strong random number generator, but it requires tracking
+on the side if we want to ensure that blocks are only read or written
+once. \fBlfsr\fR guarantees that we never generate the same offset twice, and
+it's also less computationally expensive. It's not a true random generator,
+however, though for I/O purposes it's typically good enough. \fBlfsr\fR only
+works with single block sizes, not with workloads that use multiple block
+sizes. If used with such a workload, fio may read or write some blocks
+multiple times. The default value is \fBtausworthe\fR, unless the required
+space exceeds 2^32 blocks. If it does, then \fBtausworthe64\fR is
+selected automatically.
+.RE
+.SS "Block size"
+.TP
+.BI blocksize \fR=\fPint[,int][,int] "\fR,\fB bs" \fR=\fPint[,int][,int]
+The block size in bytes used for I/O units. Default: 4096. A single value
+applies to reads, writes, and trims. Comma\-separated values may be
+specified for reads, writes, and trims. A value not terminated in a comma
+applies to subsequent types. Examples:
+.RS
+.RS
+.P
+.PD 0
+bs=256k        means 256k for reads, writes and trims.
+.P
+bs=8k,32k      means 8k for reads, 32k for writes and trims.
+.P
+bs=8k,32k,     means 8k for reads, 32k for writes, and default for trims.
+.P
+bs=,8k         means default for reads, 8k for writes and trims.
+.P
+bs=,8k,        means default for reads, 8k for writes, and default for trims.
+.PD
+.RE
+.RE
+.TP
+.BI blocksize_range \fR=\fPirange[,irange][,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange][,irange]
+A range of block sizes in bytes for I/O units. The issued I/O unit will
+always be a multiple of the minimum size, unless
+\fBblocksize_unaligned\fR is set.
+Comma\-separated ranges may be specified for reads, writes, and trims as
+described in \fBblocksize\fR. Example:
+.RS
+.RS
+.P
+bsrange=1k\-4k,2k\-8k
+.RE
+.RE
+.TP
+.BI bssplit \fR=\fPstr[,str][,str]
+Sometimes you want even finer grained control of the block sizes issued, not
+just an even split between them. This option allows you to weight various
+block sizes, so that you are able to define a specific amount of block sizes
+issued. The format for this option is:
+.RS
+.RS
+.P
+bssplit=blocksize/percentage:blocksize/percentage
+.RE
+.P
+for as many block sizes as needed. So if you want to define a workload that
+has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would write:
+.RS
+.P
+bssplit=4k/10:64k/50:32k/40
+.RE
+.P
+Ordering does not matter. If the percentage is left blank, fio will fill in
+the remaining values evenly. So a bssplit option like this one:
+.RS
+.P
+bssplit=4k/50:1k/:32k/
+.RE
+.P
+would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always add up
+to 100; if bssplit is given a range that adds up to more, it will error out.
+.P
+Comma\-separated values may be specified for reads, writes, and trims as
+described in \fBblocksize\fR.
+.P
+If you want a workload that has 50% 2k reads and 50% 4k reads, while having
+90% 4k writes and 10% 8k writes, you would specify:
+.RS
+.P
+bssplit=2k/50:4k/50,4k/90:8k/10
+.RE
+.RE
+.TP
+.BI blocksize_unaligned "\fR,\fB bs_unaligned"
+If set, fio will issue I/O units with any size within
+\fBblocksize_range\fR, not just multiples of the minimum size. This
+typically won't work with direct I/O, as that normally requires sector
+alignment.
 .TP
 .BI bs_is_seq_rand \fR=\fPbool
-If this option is set, fio will use the normal read,write blocksize settings as
-sequential,random instead. Any random read or write will use the WRITE
-blocksize settings, and any sequential read or write will use the READ
-blocksize setting.
+If this option is set, fio will use the normal read,write blocksize settings
+as sequential,random blocksize settings instead. Any random read or write
+will use the WRITE blocksize settings, and any sequential read or write will
+use the READ blocksize settings.
+.TP
+.BI blockalign \fR=\fPint[,int][,int] "\fR,\fB ba" \fR=\fPint[,int][,int]
+Boundary to which fio will align random I/O units. Default:
+\fBblocksize\fR. Minimum alignment is typically 512b for using direct
+I/O, though it usually depends on the hardware block size. This option is
+mutually exclusive with using a random map for files, so it will turn off
+that option. Comma\-separated values may be specified for reads, writes, and
+trims as described in \fBblocksize\fR.
+.SS "Buffers and memory"
 .TP
-.B zero_buffers
+.BI zero_buffers
 Initialize buffers with all zeros. Default: fill buffers with random data.
 .TP
-.B refill_buffers
-If this option is given, fio will refill the IO buffers on every submit. The
-default is to only fill it at init time and reuse that data. Only makes sense
-if zero_buffers isn't specified, naturally. If data verification is enabled,
-refill_buffers is also automatically enabled.
+.BI refill_buffers
+If this option is given, fio will refill the I/O buffers on every
+submit. The default is to only fill it at init time and reuse that
+data. Only makes sense if zero_buffers isn't specified, naturally. If data
+verification is enabled, \fBrefill_buffers\fR is also automatically enabled.
 .TP
 .BI scramble_buffers \fR=\fPbool
 If \fBrefill_buffers\fR is too costly and the target is using data
-deduplication, then setting this option will slightly modify the IO buffer
-contents to defeat normal de-dupe attempts. This is not enough to defeat
-more clever block compression attempts, but it will stop naive dedupe
-of blocks. Default: true.
+deduplication, then setting this option will slightly modify the I/O buffer
+contents to defeat normal de\-dupe attempts. This is not enough to defeat
+more clever block compression attempts, but it will stop naive dedupe of
+blocks. Default: true.
 .TP
 .BI buffer_compress_percentage \fR=\fPint
-If this is set, then fio will attempt to provide IO buffer content (on WRITEs)
-that compress to the specified level. Fio does this by providing a mix of
-random data and a fixed pattern. The fixed pattern is either zeroes, or the
-pattern specified by \fBbuffer_pattern\fR. If the pattern option is used, it
-might skew the compression ratio slightly. Note that this is per block size
-unit, for file/disk wide compression level that matches this setting. Note
-that this is per block size unit, for file/disk wide compression level that
-matches this setting, you'll also want to set refill_buffers.
+If this is set, then fio will attempt to provide I/O buffer content (on
+WRITEs) that compresses to the specified level. Fio does this by providing a
+mix of random data and a fixed pattern. The fixed pattern is either zeros,
+or the pattern specified by \fBbuffer_pattern\fR. If the pattern option
+is used, it might skew the compression ratio slightly. Note that this is per
+block size unit; for a file/disk wide compression level that matches this
+setting, you'll also want to set \fBrefill_buffers\fR.
 .TP
 .BI buffer_compress_chunk \fR=\fPint
-See \fBbuffer_compress_percentage\fR. This setting allows fio to manage how
-big the ranges of random data and zeroed data is. Without this set, fio will
-provide \fBbuffer_compress_percentage\fR of blocksize random data, followed by
-the remaining zeroed. With this set to some chunk size smaller than the block
-size, fio can alternate random and zeroed data throughout the IO buffer.
+See \fBbuffer_compress_percentage\fR. This setting allows fio to manage
+how big the ranges of random data and zeroed data are. Without this set, fio
+will provide \fBbuffer_compress_percentage\fR of blocksize random data,
+followed by the remaining zeroed. With this set to some chunk size smaller
+than the block size, fio can alternate random and zeroed data throughout the
+I/O buffer.
 .TP
 .BI buffer_pattern \fR=\fPstr
-If set, fio will fill the IO buffers with this pattern. If not set, the contents
-of IO buffers is defined by the other options related to buffer contents. The
-setting can be any pattern of bytes, and can be prefixed with 0x for hex
-values. It may also be a string, where the string must then be wrapped with
-"", e.g.:
-.RS
+If set, fio will fill the I/O buffers with this pattern or with the contents
+of a file. If not set, the contents of I/O buffers are defined by the other
+options related to buffer contents. The setting can be any pattern of bytes,
+and can be prefixed with 0x for hex values. It may also be a string, where
+the string must then be wrapped with "". Or it may also be a filename,
+where the filename must be wrapped with '' in which case the file is
+opened and read. Note that not all the file contents will be read if that
+would cause the buffers to overflow. So, for example:
 .RS
-\fBbuffer_pattern\fR="abcd"
 .RS
-or
-.RE
-\fBbuffer_pattern\fR=-12
-.RS
-or
-.RE
-\fBbuffer_pattern\fR=0xdeadface
+.P
+.PD 0
+buffer_pattern='filename'
+.P
+or:
+.P
+buffer_pattern="abcd"
+.P
+or:
+.P
+buffer_pattern=\-12
+.P
+or:
+.P
+buffer_pattern=0xdeadface
+.PD
 .RE
-.LP
+.P
 Also you can combine everything together in any order:
-.LP
 .RS
-\fBbuffer_pattern\fR=0xdeadface"abcd"-12
+.P
+buffer_pattern=0xdeadface"abcd"\-12'filename'
 .RE
 .RE
 .TP
 .BI dedupe_percentage \fR=\fPint
-If set, fio will generate this percentage of identical buffers when writing.
-These buffers will be naturally dedupable. The contents of the buffers depend
-on what other buffer compression settings have been set. It's possible to have
-the individual buffers either fully compressible, or not at all. This option
-only controls the distribution of unique buffers.
+If set, fio will generate this percentage of identical buffers when
+writing. These buffers will be naturally dedupable. The contents of the
+buffers depend on what other buffer compression settings have been set. It's
+possible to have the individual buffers either fully compressible, or not at
+all. This option only controls the distribution of unique buffers.
 .TP
-.BI nrfiles \fR=\fPint
-Number of files to use for this job.  Default: 1.
+.BI invalidate \fR=\fPbool
+Invalidate the buffer/page cache parts of the files to be used prior to
+starting I/O if the platform and file type support it. Defaults to true.
+This will be ignored if \fBpre_read\fR is also specified for the
+same job.
 .TP
-.BI openfiles \fR=\fPint
-Number of files to keep open at the same time.  Default: \fBnrfiles\fR.
+.BI sync \fR=\fPbool
+Use synchronous I/O for buffered writes. For the majority of I/O engines,
+this means using O_SYNC. Default: false.
 .TP
-.BI file_service_type \fR=\fPstr
-Defines how files to service are selected.  The following types are defined:
+.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr
+Fio can use various types of memory as the I/O unit buffer. The allowed
+values are:
 .RS
 .RS
 .TP
-.B random
-Choose a file at random.
+.B malloc
+Use memory from \fBmalloc\fR\|(3) as the buffers. Default memory type.
 .TP
-.B roundrobin
-Round robin over opened files (default).
+.B shm
+Use shared memory as the buffers. Allocated through \fBshmget\fR\|(2).
 .TP
-.B sequential
-Do each file in the set sequentially.
+.B shmhuge
+Same as \fBshm\fR, but use huge pages as backing.
 .TP
-.B zipf
-Use a zipfian distribution to decide what file to access.
+.B mmap
+Use \fBmmap\fR\|(2) to allocate buffers. May either be anonymous memory, or can
+be file backed if a filename is given after the option. The format
+is `mem=mmap:/path/to/file'.
 .TP
-.B pareto
-Use a pareto distribution to decide what file to access.
+.B mmaphuge
+Use a memory mapped huge file as the buffer backing. Append filename
+after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file'.
 .TP
-.B gauss
-Use a gaussian (normal) distribution to decide what file to access.
-.RE
-.P
-For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be
-appended to tell fio how many I/Os to issue before switching to a new file.
-For example, specifying \fBfile_service_type=random:8\fR would cause fio to
-issue \fI8\fR I/Os before selecting a new file at random. For the non-uniform
-distributions, a floating point postfix can be given to influence how the
-distribution is skewed. See \fBrandom_distribution\fR for a description of how
-that would work.
+.B mmapshared
+Same as \fBmmap\fR, but use a MAP_SHARED mapping.
+.TP
+.B cudamalloc
+Use GPU memory as the buffers for GPUDirect RDMA benchmark.
+The \fBioengine\fR must be \fBrdma\fR.
+.RE
+.P
+The area allocated is a function of the maximum allowed bs size for the job,
+multiplied by the I/O depth given. Note that for \fBshmhuge\fR and
+\fBmmaphuge\fR to work, the system must have free huge pages allocated. This
+can normally be checked and set by reading/writing
+`/proc/sys/vm/nr_hugepages' on a Linux system. Fio assumes a huge page
+is 4MiB in size. So to calculate the number of huge pages you need for a
+given job file, add up the I/O depth of all jobs (normally one unless
+\fBiodepth\fR is used) and multiply by the maximum bs set. Then divide
+that number by the huge page size. You can see the size of the huge pages in
+`/proc/meminfo'. If no huge pages are allocated by having a non\-zero
+number in `nr_hugepages', using \fBmmaphuge\fR or \fBshmhuge\fR will fail. Also
+see \fBhugepage\-size\fR.
+.P
+\fBmmaphuge\fR also needs to have hugetlbfs mounted and the file location
+should point there. So if it's mounted in `/huge', you would use
+`mem=mmaphuge:/huge/somefile'.
 .RE
 .TP
+.BI iomem_align \fR=\fPint "\fR,\fP mem_align" \fR=\fPint
+This indicates the memory alignment of the I/O memory buffers. Note that
+the given alignment is applied to the first I/O unit buffer; if using
+\fBiodepth\fR the alignment of the following buffers is given by the
+\fBbs\fR used. In other words, if using a \fBbs\fR that is a
+multiple of the page size in the system, all buffers will be aligned to
+this value. If using a \fBbs\fR that is not page aligned, the alignment
+of subsequent I/O memory buffers is the sum of the \fBiomem_align\fR and
+\fBbs\fR used.
+.TP
+.BI hugepage\-size \fR=\fPint
+Defines the size of a huge page. Must at least be equal to the system
+setting, see `/proc/meminfo'. Defaults to 4MiB. Should probably
+always be a multiple of megabytes, so using `hugepage\-size=Xm' is the
+preferred way to set this to avoid setting a non\-pow\-2 bad value.
+.TP
+.BI lockmem \fR=\fPint
+Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to
+simulate a smaller amount of memory. The amount specified is per worker.
+.SS "I/O size"
+.TP
+.BI size \fR=\fPint
+The total size of file I/O for each thread of this job. Fio will run until
+this many bytes have been transferred, unless runtime is limited by other options
+(such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR).
+Fio will divide this size between the available files determined by options
+such as \fBnrfiles\fR, \fBfilename\fR, unless \fBfilesize\fR is
+specified by the job. If the result of division happens to be 0, the size is
+set to the physical size of the given files or devices if they exist.
+If this option is not specified, fio will use the full size of the given
+files or devices. If the files do not exist, size must be given. It is also
+possible to give size as a percentage between 1 and 100. If `size=20%' is
+given, fio will use 20% of the full size of the given files or devices.
+Can be combined with \fBoffset\fR to constrain the start and end range
+that I/O will be done within.
+.TP
+.BI io_size \fR=\fPint "\fR,\fB io_limit" \fR=\fPint
+Normally fio operates within the region set by \fBsize\fR, which means
+that the \fBsize\fR option sets both the region and size of I/O to be
+performed. Sometimes that is not what you want. With this option, it is
+possible to define just the amount of I/O that fio should do. For instance,
+if \fBsize\fR is set to 20GiB and \fBio_size\fR is set to 5GiB, fio
+will perform I/O within the first 20GiB but exit when 5GiB have been
+done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB,
+and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within
+the 0..20GiB region.
+.TP
+.BI filesize \fR=\fPirange(int)
+Individual file sizes. May be a range, in which case fio will select sizes
+for files at random within the given range and limited to \fBsize\fR in
+total (if that is given). If not given, each created file is the same size.
+This option overrides \fBsize\fR in terms of file size, which means
+this value is used as a fixed size or possible range of each file.
+.TP
+.BI file_append \fR=\fPbool
+Perform I/O after the end of the file. Normally fio will operate within the
+size of a file. If this option is set, then fio will append to the file
+instead. This has identical behavior to setting \fBoffset\fR to the size
+of a file. This option is ignored on non\-regular files.
+.TP
+.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool
+Sets size to something really large and waits for ENOSPC (no space left on
+device) as the terminating condition. Only makes sense with sequential
+write. For a read workload, the mount point will be filled first then I/O
+started on the result. This option doesn't make sense if operating on a raw
+device node, since the size of that is already known by the file system.
+Additionally, writing beyond end\-of\-device will not return ENOSPC there.
+.SS "I/O engine"
+.TP
 .BI ioengine \fR=\fPstr
-Defines how the job issues I/O.  The following types are defined:
+Defines how the job issues I/O to the file. The following types are defined:
 .RS
 .RS
 .TP
 .B sync
-Basic \fBread\fR\|(2) or \fBwrite\fR\|(2) I/O.  \fBfseek\fR\|(2) is used to
-position the I/O location.
+Basic \fBread\fR\|(2) or \fBwrite\fR\|(2)
+I/O. \fBlseek\fR\|(2) is used to position the I/O location.
+See \fBfsync\fR and \fBfdatasync\fR for syncing write I/Os.
 .TP
 .B psync
-Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O.
-Default on all supported operating systems except for Windows.
+Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O. Default on
+all supported operating systems except for Windows.
 .TP
 .B vsync
-Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate queuing by
-coalescing adjacent IOs into a single submission.
+Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate
+queuing by coalescing adjacent I/Os into a single submission.
 .TP
 .B pvsync
 Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O.
@@ -634,10 +1436,14 @@
 Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O.
 .TP
 .B libaio
-Linux native asynchronous I/O. This ioengine defines engine specific options.
+Linux native asynchronous I/O. Note that Linux may only support
+queued behavior with non\-buffered I/O (set `direct=1' or
+`buffered=0').
+This engine defines engine specific options.
 .TP
 .B posixaio
-POSIX asynchronous I/O using \fBaio_read\fR\|(3) and \fBaio_write\fR\|(3).
+POSIX asynchronous I/O using \fBaio_read\fR\|(3) and
+\fBaio_write\fR\|(3).
 .TP
 .B solarisaio
 Solaris native asynchronous I/O.
@@ -646,459 +1452,554 @@
 Windows native asynchronous I/O. Default on Windows.
 .TP
 .B mmap
-File is memory mapped with \fBmmap\fR\|(2) and data copied using
-\fBmemcpy\fR\|(3).
+File is memory mapped with \fBmmap\fR\|(2) and data copied
+to/from using \fBmemcpy\fR\|(3).
 .TP
 .B splice
-\fBsplice\fR\|(2) is used to transfer the data and \fBvmsplice\fR\|(2) to
-transfer data from user-space to the kernel.
+\fBsplice\fR\|(2) is used to transfer the data and
+\fBvmsplice\fR\|(2) to transfer data from user space to the
+kernel.
 .TP
 .B sg
-SCSI generic sg v3 I/O. May be either synchronous using the SG_IO ioctl, or if
-the target is an sg character device, we use \fBread\fR\|(2) and
-\fBwrite\fR\|(2) for asynchronous I/O.
+SCSI generic sg v3 I/O. May either be synchronous using the SG_IO
+ioctl, or if the target is an sg character device we use
+\fBread\fR\|(2) and \fBwrite\fR\|(2) for asynchronous
+I/O. Requires \fBfilename\fR option to specify either block or
+character devices.
 .TP
 .B null
-Doesn't transfer any data, just pretends to.  Mainly used to exercise \fBfio\fR
-itself and for debugging and testing purposes.
+Doesn't transfer any data, just pretends to. This is mainly used to
+exercise fio itself and for debugging/testing purposes.
 .TP
 .B net
-Transfer over the network.  The protocol to be used can be defined with the
-\fBprotocol\fR parameter.  Depending on the protocol, \fBfilename\fR,
-\fBhostname\fR, \fBport\fR, or \fBlisten\fR must be specified.
-This ioengine defines engine specific options.
+Transfer over the network to given `host:port'. Depending on the
+\fBprotocol\fR used, the \fBhostname\fR, \fBport\fR,
+\fBlisten\fR and \fBfilename\fR options are used to specify
+what sort of connection to make, while the \fBprotocol\fR option
+determines which protocol will be used. This engine defines engine
+specific options.
 .TP
 .B netsplice
-Like \fBnet\fR, but uses \fBsplice\fR\|(2) and \fBvmsplice\fR\|(2) to map data
-and send/receive. This ioengine defines engine specific options.
+Like \fBnet\fR, but uses \fBsplice\fR\|(2) and
+\fBvmsplice\fR\|(2) to map data and send/receive.
+This engine defines engine specific options.
 .TP
 .B cpuio
-Doesn't transfer any data, but burns CPU cycles according to \fBcpuload\fR and
-\fBcpuchunks\fR parameters. A job never finishes unless there is at least one
-non-cpuio job.
+Doesn't transfer any data, but burns CPU cycles according to the
+\fBcpuload\fR and \fBcpuchunks\fR options. Setting
+\fBcpuload\fR=85 will cause that job to do nothing but burn 85%
+of the CPU. In case of SMP machines, use `numjobs=<nr_of_cpu>'
+to get desired CPU usage, as the cpuload only loads a
+single CPU at the desired rate. A job never finishes unless there is
+at least one non\-cpuio job.
 .TP
 .B guasi
-The GUASI I/O engine is the Generic Userspace Asynchronous Syscall Interface
-approach to asynchronous I/O.
-.br
-See <http://www.xmailserver.org/guasi\-lib.html>.
+The GUASI I/O engine is the Generic Userspace Asynchronous Syscall
+Interface approach to async I/O. See \fIhttp://www.xmailserver.org/guasi\-lib.html\fR
+for more info on GUASI.
 .TP
 .B rdma
-The RDMA I/O engine supports both RDMA memory semantics (RDMA_WRITE/RDMA_READ)
-and channel semantics (Send/Recv) for the InfiniBand, RoCE and iWARP protocols.
-.TP
-.B external
-Loads an external I/O engine object file.  Append the engine filename as
-`:\fIenginepath\fR'.
+The RDMA I/O engine supports both RDMA memory semantics
+(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
+InfiniBand, RoCE and iWARP protocols.
 .TP
 .B falloc
-   IO engine that does regular linux native fallocate call to simulate data
-transfer as fio ioengine
-.br
-  DDIR_READ  does fallocate(,mode = FALLOC_FL_KEEP_SIZE,)
-.br
-  DIR_WRITE does fallocate(,mode = 0)
-.br
-  DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE)
+I/O engine that does regular fallocate to simulate data transfer as
+fio ioengine.
+.RS
+.P
+.PD 0
+DDIR_READ      does fallocate(,mode = FALLOC_FL_KEEP_SIZE,).
+.P
+DDIR_WRITE     does fallocate(,mode = 0).
+.P
+DDIR_TRIM      does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE).
+.PD
+.RE
+.TP
+.B ftruncate
+I/O engine that sends \fBftruncate\fR\|(2) operations in response
+to write (DDIR_WRITE) events. Each ftruncate issued sets the file's
+size to the current block offset. \fBblocksize\fR is ignored.
 .TP
 .B e4defrag
-IO engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate defragment activity
-request to DDIR_WRITE event
+I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
+defragment activity in response to DDIR_WRITE events.
 .TP
 .B rbd
-IO engine supporting direct access to Ceph Rados Block Devices (RBD) via librbd
-without the need to use the kernel rbd driver. This ioengine defines engine specific
-options.
+I/O engine supporting direct access to Ceph Rados Block Devices
+(RBD) via librbd without the need to use the kernel rbd driver. This
+ioengine defines engine specific options.
 .TP
 .B gfapi
-Using Glusterfs libgfapi sync interface to direct access to Glusterfs volumes without
-having to go through FUSE. This ioengine defines engine specific
-options.
+Using GlusterFS libgfapi sync interface to direct access to
+GlusterFS volumes without having to go through FUSE. This ioengine
+defines engine specific options.
 .TP
 .B gfapi_async
-Using Glusterfs libgfapi async interface to direct access to Glusterfs volumes without
-having to go through FUSE. This ioengine defines engine specific
-options.
+Using GlusterFS libgfapi async interface to direct access to
+GlusterFS volumes without having to go through FUSE. This ioengine
+defines engine specific options.
 .TP
 .B libhdfs
-Read and write through Hadoop (HDFS).  The \fBfilename\fR option is used to
-specify host,port of the hdfs name-node to connect. This engine interprets
-offsets a little differently. In HDFS, files once created cannot be modified.
-So random writes are not possible. To imitate this, libhdfs engine expects
-bunch of small files to be created over HDFS, and engine will randomly pick a
-file out of those files based on the offset generated by fio backend. (see the
-example job file to create such files, use rw=write option). Please note, you
-might want to set necessary environment variables to work with hdfs/libhdfs
-properly.
+Read and write through Hadoop (HDFS). The \fBfilename\fR option
+is used to specify host,port of the hdfs name\-node to connect. This
+engine interprets offsets a little differently. In HDFS, files once
+created cannot be modified so random writes are not possible. To
+imitate this the libhdfs engine expects a bunch of small files to be
+created over HDFS and will randomly pick a file from them
+based on the offset generated by fio backend (see the example
+job file to create such files, use `rw=write' option). Please
+note, it may be necessary to set environment variables to work
+with HDFS/libhdfs properly. Each job uses its own connection to
+HDFS.
 .TP
 .B mtd
-Read, write and erase an MTD character device (e.g., /dev/mtd0). Discards are
-treated as erases. Depending on the underlying device type, the I/O may have
-to go in a certain pattern, e.g., on NAND, writing sequentially to erase blocks
-and discarding before overwriting. The writetrim mode works well for this
+Read, write and erase an MTD character device (e.g.,
+`/dev/mtd0'). Discards are treated as erases. Depending on the
+underlying device type, the I/O may have to go in a certain pattern,
+e.g., on NAND, writing sequentially to erase blocks and discarding
+before overwriting. The \fBtrimwrite\fR mode works well for this
 constraint.
 .TP
 .B pmemblk
-Read and write through the NVML libpmemblk interface.
+Read and write using filesystem DAX to a file on a filesystem
+mounted with DAX on a persistent memory device through the NVML
+libpmemblk library.
+.TP
+.B dev\-dax
+Read and write using device DAX to a persistent memory device (e.g.,
+/dev/dax0.0) through the NVML libpmem library.
 .TP
-.B dev-dax
-Read and write through a DAX device exposed from persistent memory.
-.RE
-.P
-.RE
-.TP
-.BI iodepth \fR=\fPint
-Number of I/O units to keep in flight against the file. Note that increasing
-iodepth beyond 1 will not affect synchronous ioengines (except for small
-degress when verify_async is in use). Even async engines may impose OS
-restrictions causing the desired depth not to be achieved.  This may happen on
-Linux when using libaio and not setting \fBdirect\fR=1, since buffered IO is
-not async on that OS. Keep an eye on the IO depth distribution in the
-fio output to verify that the achieved depth is as expected. Default: 1.
-.TP
-.BI iodepth_batch \fR=\fPint "\fR,\fP iodepth_batch_submit" \fR=\fPint
-This defines how many pieces of IO to submit at once. It defaults to 1
-which means that we submit each IO as soon as it is available, but can
-be raised to submit bigger batches of IO at the time. If it is set to 0
-the \fBiodepth\fR value will be used.
-.TP
-.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint
-This defines how many pieces of IO to retrieve at once. It defaults to 1 which
- means that we'll ask for a minimum of 1 IO in the retrieval process from the
-kernel. The IO retrieval will go on until we hit the limit set by
-\fBiodepth_low\fR. If this variable is set to 0, then fio will always check for
-completed events before queuing more IO. This helps reduce IO latency, at the
-cost of more retrieval system calls.
+.B external
+Prefix to specify loading an external I/O engine object file. Append
+the engine filename, e.g. `ioengine=external:/tmp/foo.o' to load
+ioengine `foo.o' in `/tmp'. The path can be either
+absolute or relative. See `engines/skeleton_external.c' in the fio source for
+details of writing an external I/O engine.
+.SS "I/O engine specific parameters"
+In addition, there are some parameters which are only valid when a specific
+\fBioengine\fR is in use. These are used identically to normal parameters,
+with the caveat that when used on the command line, they must come after the
+\fBioengine\fR that defines them is selected.
 .TP
-.BI iodepth_batch_complete_max \fR=\fPint
-This defines maximum pieces of IO to
-retrieve at once. This variable should be used along with
-\fBiodepth_batch_complete_min\fR=int variable, specifying the range
-of min and max amount of IO which should be retrieved. By default
-it is equal to \fBiodepth_batch_complete_min\fR value.
-
-Example #1:
-.RS
-.RS
-\fBiodepth_batch_complete_min\fR=1
-.LP
-\fBiodepth_batch_complete_max\fR=<iodepth>
-.RE
-
-which means that we will retrieve at least 1 IO and up to the
-whole submitted queue depth. If none of IO has been completed
-yet, we will wait.
-
-Example #2:
-.RS
-\fBiodepth_batch_complete_min\fR=0
-.LP
-\fBiodepth_batch_complete_max\fR=<iodepth>
-.RE
-
-which means that we can retrieve up to the whole submitted
-queue depth, but if none of IO has been completed yet, we will
-NOT wait and immediately exit the system call. In this example
-we simply do polling.
-.RE
+.BI (libaio)userspace_reap
+Normally, with the libaio engine in use, fio will use the
+\fBio_getevents\fR\|(3) system call to reap newly returned events. With
+this flag turned on, the AIO ring will be read directly from user\-space to
+reap events. The reaping mode is only enabled when polling for a minimum of
+0 events (e.g. when `iodepth_batch_complete=0').
 .TP
-.BI iodepth_low \fR=\fPint
-Low watermark indicating when to start filling the queue again.  Default:
-\fBiodepth\fR.
+.BI (pvsync2)hipri
+Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
+than normal.
 .TP
-.BI io_submit_mode \fR=\fPstr
-This option controls how fio submits the IO to the IO engine. The default is
-\fBinline\fR, which means that the fio job threads submit and reap IO directly.
-If set to \fBoffload\fR, the job threads will offload IO submission to a
-dedicated pool of IO threads. This requires some coordination and thus has a
-bit of extra overhead, especially for lower queue depth IO where it can
-increase latencies. The benefit is that fio can manage submission rates
-independently of the device completion rates. This avoids skewed latency
-reporting if IO gets back up on the device side (the coordinated omission
-problem).
+.BI (pvsync2)hipri_percentage
+When hipri is set this determines the probability of a pvsync2 I/O being high
+priority. The default is 100%.
 .TP
-.BI direct \fR=\fPbool
-If true, use non-buffered I/O (usually O_DIRECT).  Default: false.
+.BI (cpuio)cpuload \fR=\fPint
+Attempt to use the specified percentage of CPU cycles. This is a mandatory
+option when using cpuio I/O engine.
 .TP
-.BI atomic \fR=\fPbool
-If value is true, attempt to use atomic direct IO. Atomic writes are guaranteed
-to be stable once acknowledged by the operating system. Only Linux supports
-O_ATOMIC right now.
+.BI (cpuio)cpuchunks \fR=\fPint
+Split the load into cycles of the given time. In microseconds.
 .TP
-.BI buffered \fR=\fPbool
-If true, use buffered I/O.  This is the opposite of the \fBdirect\fR parameter.
-Default: true.
+.BI (cpuio)exit_on_io_done \fR=\fPbool
+Detect when I/O threads are done, then exit.
 .TP
-.BI offset \fR=\fPint
-Offset in the file to start I/O. Data before the offset will not be touched.
+.BI (libhdfs)namenode \fR=\fPstr
+The hostname or IP address of a HDFS cluster namenode to contact.
 .TP
-.BI offset_increment \fR=\fPint
-If this is provided, then the real offset becomes the
-offset + offset_increment * thread_number, where the thread number is a
-counter that starts at 0 and is incremented for each sub-job (i.e. when
-numjobs option is specified). This option is useful if there are several jobs
-which are intended to operate on a file in parallel disjoint segments, with
-even spacing between the starting points.
+.BI (libhdfs)port
+The listening port of the HDFS cluster namenode.
 .TP
-.BI number_ios \fR=\fPint
-Fio will normally perform IOs until it has exhausted the size of the region
-set by \fBsize\fR, or if it exhaust the allocated time (or hits an error
-condition). With this setting, the range/size can be set independently of
-the number of IOs to perform. When fio reaches this number, it will exit
-normally and report status. Note that this does not extend the amount
-of IO that will be done, it will only stop fio if this condition is met
-before other end-of-job criteria.
+.BI (netsplice,net)port
+The TCP or UDP port to bind to or connect to. If this is used with
+\fBnumjobs\fR to spawn multiple instances of the same job type, then
+this will be the starting port number since fio will use a range of
+ports.
 .TP
-.BI fsync \fR=\fPint
-How many I/Os to perform before issuing an \fBfsync\fR\|(2) of dirty data.  If
-0, don't sync.  Default: 0.
+.BI (netsplice,net)hostname \fR=\fPstr
+The hostname or IP address to use for TCP or UDP based I/O. If the job is
+a TCP listener or UDP reader, the hostname is not used and must be omitted
+unless it is a valid UDP multicast address.
+.TP
+.BI (netsplice,net)interface \fR=\fPstr
+The IP address of the network interface used to send or receive UDP
+multicast.
 .TP
-.BI fdatasync \fR=\fPint
-Like \fBfsync\fR, but uses \fBfdatasync\fR\|(2) instead to only sync the
-data parts of the file. Default: 0.
+.BI (netsplice,net)ttl \fR=\fPint
+Time\-to\-live value for outgoing UDP multicast packets. Default: 1.
 .TP
-.BI write_barrier \fR=\fPint
-Make every Nth write a barrier write.
+.BI (netsplice,net)nodelay \fR=\fPbool
+Set TCP_NODELAY on TCP connections.
 .TP
-.BI sync_file_range \fR=\fPstr:int
-Use \fBsync_file_range\fR\|(2) for every \fRval\fP number of write operations. Fio will
-track range of writes that have happened since the last \fBsync_file_range\fR\|(2) call.
-\fRstr\fP can currently be one or more of:
+.BI (netsplice,net)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr
+The network protocol to use. Accepted values are:
+.RS
 .RS
 .TP
-.B wait_before
-SYNC_FILE_RANGE_WAIT_BEFORE
+.B tcp
+Transmission control protocol.
 .TP
-.B write
-SYNC_FILE_RANGE_WRITE
+.B tcpv6
+Transmission control protocol V6.
 .TP
-.B wait_after
-SYNC_FILE_RANGE_WRITE
+.B udp
+User datagram protocol.
 .TP
+.B udpv6
+User datagram protocol V6.
+.TP
+.B unix
+UNIX domain socket.
 .RE
 .P
-So if you do sync_file_range=wait_before,write:8, fio would use
-\fBSYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE\fP for every 8 writes.
-Also see the \fBsync_file_range\fR\|(2) man page.  This option is Linux specific.
+When the protocol is TCP or UDP, the port must also be given, as well as the
+hostname if the job is a TCP listener or UDP reader. For unix sockets, the
+normal \fBfilename\fR option should be used and the port is invalid.
+.RE
+.TP
+.BI (netsplice,net)listen
+For TCP network connections, tell fio to listen for incoming connections
+rather than initiating an outgoing connection. The \fBhostname\fR must
+be omitted if this option is used.
+.TP
+.BI (netsplice,net)pingpong
+Normally a network writer will just continue writing data, and a network
+reader will just consume packets. If `pingpong=1' is set, a writer will
+send its normal payload to the reader, then wait for the reader to send the
+same payload back. This allows fio to measure network latencies. The
+submission and completion latencies then measure local time spent sending or
+receiving, and the completion latency measures how long it took for the
+other end to receive and send back. For UDP multicast traffic
+`pingpong=1' should only be set for a single reader when multiple readers
+are listening to the same address.
 .TP
-.BI overwrite \fR=\fPbool
-If writing, setup the file first and do overwrites.  Default: false.
+.BI (netsplice,net)window_size \fR=\fPint
+Set the desired socket buffer size for the connection.
 .TP
-.BI end_fsync \fR=\fPbool
-Sync file contents when a write stage has completed.  Default: false.
+.BI (netsplice,net)mss \fR=\fPint
+Set the TCP maximum segment size (TCP_MAXSEG).
 .TP
-.BI fsync_on_close \fR=\fPbool
-If true, sync file contents on close.  This differs from \fBend_fsync\fR in that
-it will happen on every close, not just at the end of the job.  Default: false.
+.BI (e4defrag)donorname \fR=\fPstr
+File will be used as a block donor (swap extents between files).
 .TP
-.BI rwmixread \fR=\fPint
-Percentage of a mixed workload that should be reads. Default: 50.
+.BI (e4defrag)inplace \fR=\fPint
+Configure donor file blocks allocation strategy:
+.RS
+.RS
 .TP
-.BI rwmixwrite \fR=\fPint
-Percentage of a mixed workload that should be writes.  If \fBrwmixread\fR and
-\fBrwmixwrite\fR are given and do not sum to 100%, the latter of the two
-overrides the first. This may interfere with a given rate setting, if fio is
-asked to limit reads or writes to a certain rate. If that is the case, then
-the distribution may be skewed. Default: 50.
+.B 0
+Default. Preallocate donor's file on init.
 .TP
-.BI random_distribution \fR=\fPstr:float
-By default, fio will use a completely uniform random distribution when asked
-to perform random IO. Sometimes it is useful to skew the distribution in
-specific ways, ensuring that some parts of the data is more hot than others.
-Fio includes the following distribution models:
-.RS
+.B 1
+Allocate space immediately inside defragment event, and free right
+after event.
+.RE
+.RE
 .TP
-.B random
-Uniform random distribution
+.BI (rbd)clustername \fR=\fPstr
+Specifies the name of the Ceph cluster.
 .TP
-.B zipf
-Zipf distribution
+.BI (rbd)rbdname \fR=\fPstr
+Specifies the name of the RBD.
 .TP
-.B pareto
-Pareto distribution
+.BI (rbd)pool \fR=\fPstr
+Specifies the name of the Ceph pool containing RBD.
 .TP
-.B gauss
-Normal (gaussian) distribution
+.BI (rbd)clientname \fR=\fPstr
+Specifies the username (without the 'client.' prefix) used to access the
+Ceph cluster. If the \fBclustername\fR is specified, the \fBclientname\fR shall be
+the full *type.id* string. If no type. prefix is given, fio will add 'client.'
+by default.
 .TP
-.B zoned
-Zoned random distribution
+.BI (mtd)skip_bad \fR=\fPbool
+Skip operations against known bad blocks.
 .TP
-.RE
-When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also
-needed to define the access pattern. For \fBzipf\fR, this is the zipf theta.
-For \fBpareto\fR, it's the pareto power. Fio includes a test program, genzipf,
-that can be used visualize what the given input values will yield in terms of
-hit rates. If you wanted to use \fBzipf\fR with a theta of 1.2, you would use
-random_distribution=zipf:1.2 as the option. If a non-uniform model is used,
-fio will disable use of the random map. For the \fBgauss\fR distribution, a
-normal deviation is supplied as a value between 0 and 100.
-.P
-.RS
-For a \fBzoned\fR distribution, fio supports specifying percentages of IO
-access that should fall within what range of the file or device. For example,
-given a criteria of:
-.P
-.RS
-60% of accesses should be to the first 10%
-.RE
+.BI (libhdfs)hdfsdirectory
+libhdfs will create chunks in this HDFS directory.
+.TP
+.BI (libhdfs)chunk_size
+The size of the chunk to use for each file.
+.SS "I/O depth"
+.TP
+.BI iodepth \fR=\fPint
+Number of I/O units to keep in flight against the file. Note that
+increasing \fBiodepth\fR beyond 1 will not affect synchronous ioengines (except
+for small degrees when \fBverify_async\fR is in use). Even async
+engines may impose OS restrictions causing the desired depth not to be
+achieved. This may happen on Linux when using libaio and not setting
+`direct=1', since buffered I/O is not async on that OS. Keep an
+eye on the I/O depth distribution in the fio output to verify that the
+achieved depth is as expected. Default: 1.
+.TP
+.BI iodepth_batch_submit \fR=\fPint "\fR,\fP iodepth_batch" \fR=\fPint
+This defines how many pieces of I/O to submit at once. It defaults to 1
+which means that we submit each I/O as soon as it is available, but can be
+raised to submit bigger batches of I/O at a time. If it is set to 0 the
+\fBiodepth\fR value will be used.
+.TP
+.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint
+This defines how many pieces of I/O to retrieve at once. It defaults to 1
+which means that we'll ask for a minimum of 1 I/O in the retrieval process
+from the kernel. The I/O retrieval will go on until we hit the limit set by
+\fBiodepth_low\fR. If this variable is set to 0, then fio will always
+check for completed events before queuing more I/O. This helps reduce I/O
+latency, at the cost of more retrieval system calls.
+.TP
+.BI iodepth_batch_complete_max \fR=\fPint
+This defines the maximum pieces of I/O to retrieve at once. This variable should
+be used along with \fBiodepth_batch_complete_min\fR=\fIint\fR variable,
+specifying the range of min and max amount of I/O which should be
+retrieved. By default it is equal to \fBiodepth_batch_complete_min\fR
+value. Example #1:
 .RS
-30% of accesses should be to the next 20%
-.RE
 .RS
-8% of accesses should be to to the next 30%
+.P
+.PD 0
+iodepth_batch_complete_min=1
+.P
+iodepth_batch_complete_max=<iodepth>
+.PD
 .RE
+.P
+which means that we will retrieve at least 1 I/O and up to the whole
+submitted queue depth. If no I/O has completed yet, we will wait.
+Example #2:
 .RS
-2% of accesses should be to the next 40%
-.RE
 .P
-we can define that through zoning of the random accesses. For the above
-example, the user would do:
+.PD 0
+iodepth_batch_complete_min=0
 .P
-.RS
-.B random_distribution=zoned:60/10:30/20:8/30:2/40
+iodepth_batch_complete_max=<iodepth>
+.PD
 .RE
 .P
-similarly to how \fBbssplit\fR works for setting ranges and percentages of block
-sizes. Like \fBbssplit\fR, it's possible to specify separate zones for reads,
-writes, and trims. If just one set is given, it'll apply to all of them.
+which means that we can retrieve up to the whole submitted queue depth, but
+if no I/O has completed yet, we will NOT wait and immediately exit
+the system call. In this example we simply do polling.
 .RE
 .TP
-.BI percentage_random \fR=\fPint
-For a random workload, set how big a percentage should be random. This defaults
-to 100%, in which case the workload is fully random. It can be set from
-anywhere from 0 to 100.  Setting it to 0 would make the workload fully
-sequential. It is possible to set different values for reads, writes, and
-trim. To do so, simply use a comma separated list. See \fBblocksize\fR.
+.BI iodepth_low \fR=\fPint
+The low water mark indicating when to start filling the queue
+again. Defaults to the same as \fBiodepth\fR, meaning that fio will
+attempt to keep the queue full at all times. If \fBiodepth\fR is set to
+e.g. 16 and \fBiodepth_low\fR is set to 4, then after fio has filled the queue of
+16 requests, it will let the depth drain down to 4 before starting to fill
+it again.
+.TP
+.BI serialize_overlap \fR=\fPbool
+Serialize in-flight I/Os that might otherwise cause or suffer from data races.
+When two or more I/Os are submitted simultaneously, there is no guarantee that
+the I/Os will be processed or completed in the submitted order. Further, if
+two or more of those I/Os are writes, any overlapping region between them can
+become indeterminate/undefined on certain storage. These issues can cause
+verification to fail erratically when at least one of the racing I/Os is
+changing data and the overlapping region has a non-zero size. Setting
+\fBserialize_overlap\fR tells fio to avoid provoking this behavior by explicitly
+serializing in-flight I/Os that have a non-zero overlap. Note that setting
+this option can reduce both performance and the \fBiodepth\fR achieved.
+Additionally this option does not work when \fBio_submit_mode\fR is set to
+offload. Default: false.
 .TP
-.B norandommap
-Normally \fBfio\fR will cover every block of the file when doing random I/O. If
-this parameter is given, a new offset will be chosen without looking at past
-I/O history.  This parameter is mutually exclusive with \fBverify\fR.
+.BI io_submit_mode \fR=\fPstr
+This option controls how fio submits the I/O to the I/O engine. The default
+is `inline', which means that the fio job threads submit and reap I/O
+directly. If set to `offload', the job threads will offload I/O submission
+to a dedicated pool of I/O threads. This requires some coordination and thus
+has a bit of extra overhead, especially for lower queue depth I/O where it
+can increase latencies. The benefit is that fio can manage submission rates
+independently of the device completion rates. This avoids skewed latency
+reporting if I/O gets backed up on the device side (the coordinated omission
+problem).
+.SS "I/O rate"
 .TP
-.BI softrandommap \fR=\fPbool
-See \fBnorandommap\fR. If fio runs with the random block map enabled and it
-fails to allocate the map, if this option is set it will continue without a
-random block map. As coverage will not be as complete as with random maps, this
-option is disabled by default.
+.BI thinktime \fR=\fPtime
+Stall the job for the specified period of time after an I/O has completed before issuing the
+next. May be used to simulate processing being done by an application.
+When the unit is omitted, the value is interpreted in microseconds. See
+\fBthinktime_blocks\fR and \fBthinktime_spin\fR.
+.TP
+.BI thinktime_spin \fR=\fPtime
+Only valid if \fBthinktime\fR is set \- pretend to spend CPU time doing
+something with the data received, before falling back to sleeping for the
+rest of the period specified by \fBthinktime\fR. When the unit is
+omitted, the value is interpreted in microseconds.
 .TP
-.BI random_generator \fR=\fPstr
-Fio supports the following engines for generating IO offsets for random IO:
-.RS
+.BI thinktime_blocks \fR=\fPint
+Only valid if \fBthinktime\fR is set \- control how many blocks to issue,
+before waiting \fBthinktime\fR usecs. If not set, defaults to 1 which will make
+fio wait \fBthinktime\fR usecs after every block. This effectively makes any
+queue depth setting redundant, since no more than 1 I/O will be queued
+before we have to complete it and do our \fBthinktime\fR. In other words, this
+setting effectively caps the queue depth if the latter is larger.
+.TP
+.BI rate \fR=\fPint[,int][,int]
+Cap the bandwidth used by this job. The number is in bytes/sec, the normal
+suffix rules apply. Comma\-separated values may be specified for reads,
+writes, and trims as described in \fBblocksize\fR.
+.RS
+.P
+For example, using `rate=1m,500k' would limit reads to 1MiB/sec and writes to
+500KiB/sec. Capping only reads or writes can be done with `rate=,500k' or
+`rate=500k,' where the former will only limit writes (to 500KiB/sec) and the
+latter will only limit reads.
+.RE
+.TP
+.BI rate_min \fR=\fPint[,int][,int]
+Tell fio to do whatever it can to maintain at least this bandwidth. Failing
+to meet this requirement will cause the job to exit. Comma\-separated values
+may be specified for reads, writes, and trims as described in
+\fBblocksize\fR.
+.TP
+.BI rate_iops \fR=\fPint[,int][,int]
+Cap the bandwidth to this number of IOPS. Basically the same as
+\fBrate\fR, just specified independently of bandwidth. If the job is
+given a block size range instead of a fixed value, the smallest block size
+is used as the metric. Comma\-separated values may be specified for reads,
+writes, and trims as described in \fBblocksize\fR.
+.TP
+.BI rate_iops_min \fR=\fPint[,int][,int]
+If fio doesn't meet this rate of I/O, it will cause the job to exit.
+Comma\-separated values may be specified for reads, writes, and trims as
+described in \fBblocksize\fR.
 .TP
-.B tausworthe
-Strong 2^88 cycle random number generator
+.BI rate_process \fR=\fPstr
+This option controls how fio manages rated I/O submissions. The default is
+`linear', which submits I/O in a linear fashion with fixed delays between
+I/Os that get adjusted based on I/O completion rates. If this is set to
+`poisson', fio will submit I/O based on a more real world random request
+flow, known as the Poisson process
+(\fIhttps://en.wikipedia.org/wiki/Poisson_point_process\fR). The lambda will be
+10^6 / IOPS for the given workload.
+.SS "I/O latency"
 .TP
-.B lfsr
-Linear feedback shift register generator
+.BI latency_target \fR=\fPtime
+If set, fio will attempt to find the max performance point that the given
+workload will run at while maintaining a latency below this target. When
+the unit is omitted, the value is interpreted in microseconds. See
+\fBlatency_window\fR and \fBlatency_percentile\fR.
 .TP
-.B tausworthe64
-Strong 64-bit 2^258 cycle random number generator
+.BI latency_window \fR=\fPtime
+Used with \fBlatency_target\fR to specify the sample window that the job
+is run at varying queue depths to test the performance. When the unit is
+omitted, the value is interpreted in microseconds.
 .TP
-.RE
-.P
-Tausworthe is a strong random number generator, but it requires tracking on the
-side if we want to ensure that blocks are only read or written once. LFSR
-guarantees that we never generate the same offset twice, and it's also less
-computationally expensive. It's not a true random generator, however, though
-for IO purposes it's typically good enough. LFSR only works with single block
-sizes, not with workloads that use multiple block sizes. If used with such a
-workload, fio may read or write some blocks multiple times. The default
-value is tausworthe, unless the required space exceeds 2^32 blocks. If it does,
-then tausworthe64 is selected automatically.
+.BI latency_percentile \fR=\fPfloat
+The percentage of I/Os that must fall within the criteria specified by
+\fBlatency_target\fR and \fBlatency_window\fR. If not set, this
+defaults to 100.0, meaning that all I/Os must be equal to or below the value
+set by \fBlatency_target\fR.
+.TP
+.BI max_latency \fR=\fPtime
+If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
+maximum latency. When the unit is omitted, the value is interpreted in
+microseconds.
 .TP
-.BI nice \fR=\fPint
-Run job with given nice value.  See \fBnice\fR\|(2).
+.BI rate_cycle \fR=\fPint
+Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number
+of milliseconds. Defaults to 1000.
+.SS "I/O replay"
 .TP
-.BI prio \fR=\fPint
-Set I/O priority value of this job between 0 (highest) and 7 (lowest).  See
-\fBionice\fR\|(1).
+.BI write_iolog \fR=\fPstr
+Write the issued I/O patterns to the specified file. See
+\fBread_iolog\fR. Specify a separate file for each job, otherwise the
+iologs will be interspersed and the file may be corrupt.
 .TP
-.BI prioclass \fR=\fPint
-Set I/O priority class.  See \fBionice\fR\|(1).
+.BI read_iolog \fR=\fPstr
+Open an iolog with the specified filename and replay the I/O patterns it
+contains. This can be used to store a workload and replay it sometime
+later. The iolog given may also be a blktrace binary file, which allows fio
+to replay a workload captured by blktrace. See
+\fBblktrace\fR\|(8) for how to capture such logging data. For blktrace
+replay, the file needs to be turned into a blkparse binary data file first
+(`blkparse <device> \-o /dev/null \-d file_for_fio.bin').
+.TP
+.BI replay_no_stall \fR=\fPbool
+When replaying I/O with \fBread_iolog\fR the default behavior is to
+attempt to respect the timestamps within the log and replay them with the
+appropriate delay between IOPS. By setting this variable fio will not
+respect the timestamps and attempt to replay them as fast as possible while
+still respecting ordering. The result is the same I/O pattern to a given
+device, but different timings.
 .TP
-.BI thinktime \fR=\fPint
-Stall job for given number of microseconds between issuing I/Os.
+.BI replay_redirect \fR=\fPstr
+While replaying I/O patterns using \fBread_iolog\fR the default behavior
+is to replay the IOPS onto the major/minor device that each IOP was recorded
+from. This is sometimes undesirable because on a different machine those
+major/minor numbers can map to a different device. Changing hardware on the
+same system can also result in a different major/minor mapping.
+\fBreplay_redirect\fR causes all I/Os to be replayed onto the single specified
+device regardless of the device it was recorded
+from. i.e. `replay_redirect=/dev/sdc' would cause all I/O
+in the blktrace or iolog to be replayed onto `/dev/sdc'. This means
+multiple devices will be replayed onto a single device, if the trace
+contains multiple devices. If you want multiple devices to be replayed
+concurrently to multiple redirected devices you must blkparse your trace
+into separate traces and replay them with independent fio invocations.
+Unfortunately this also breaks the strict time ordering between multiple
+device accesses.
 .TP
-.BI thinktime_spin \fR=\fPint
-Pretend to spend CPU time for given number of microseconds, sleeping the rest
-of the time specified by \fBthinktime\fR.  Only valid if \fBthinktime\fR is set.
+.BI replay_align \fR=\fPint
+Force alignment of I/O offsets and lengths in a trace to this power of 2
+value.
 .TP
-.BI thinktime_blocks \fR=\fPint
-Only valid if thinktime is set - control how many blocks to issue, before
-waiting \fBthinktime\fR microseconds. If not set, defaults to 1 which will
-make fio wait \fBthinktime\fR microseconds after every block. This
-effectively makes any queue depth setting redundant, since no more than 1 IO
-will be queued before we have to complete it and do our thinktime. In other
-words, this setting effectively caps the queue depth if the latter is larger.
-Default: 1.
-.TP
-.BI rate \fR=\fPint
-Cap bandwidth used by this job. The number is in bytes/sec, the normal postfix
-rules apply. You can use \fBrate\fR=500k to limit reads and writes to 500k each,
-or you can specify read and writes separately. Using \fBrate\fR=1m,500k would
-limit reads to 1MB/sec and writes to 500KB/sec. Capping only reads or writes
-can be done with \fBrate\fR=,500k or \fBrate\fR=500k,. The former will only
-limit writes (to 500KB/sec), the latter will only limit reads.
-.TP
-.BI rate_min \fR=\fPint
-Tell \fBfio\fR to do whatever it can to maintain at least the given bandwidth.
-Failing to meet this requirement will cause the job to exit. The same format
-as \fBrate\fR is used for read vs write separation.
-.TP
-.BI rate_iops \fR=\fPint
-Cap the bandwidth to this number of IOPS. Basically the same as rate, just
-specified independently of bandwidth. The same format as \fBrate\fR is used for
-read vs write separation. If \fBblocksize\fR is a range, the smallest block
-size is used as the metric.
-.TP
-.BI rate_iops_min \fR=\fPint
-If this rate of I/O is not met, the job will exit. The same format as \fBrate\fR
-is used for read vs write separation.
+.BI replay_scale \fR=\fPint
+Scale sector offsets down by this factor when replaying traces.
+.SS "Threads, processes and job synchronization"
 .TP
-.BI rate_process \fR=\fPstr
-This option controls how fio manages rated IO submissions. The default is
-\fBlinear\fR, which submits IO in a linear fashion with fixed delays between
-IOs that gets adjusted based on IO completion rates. If this is set to
-\fBpoisson\fR, fio will submit IO based on a more real world random request
-flow, known as the Poisson process
-(https://en.wikipedia.org/wiki/Poisson_process). The lambda will be
-10^6 / IOPS for the given workload.
+.BI thread
+Fio defaults to creating jobs by using fork, however if this option is
+given, fio will create jobs by using POSIX Threads' function
+\fBpthread_create\fR\|(3) to create threads instead.
 .TP
-.BI rate_cycle \fR=\fPint
-Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number of
-milliseconds.  Default: 1000ms.
+.BI wait_for \fR=\fPstr
+If set, the current job won't be started until all workers of the specified
+waitee job are done.
+.\" ignore blank line here from HOWTO as it looks normal without it
+\fBwait_for\fR operates on the job name basis, so there are a few
+limitations. First, the waitee must be defined prior to the waiter job
+(meaning no forward references). Second, if a job is being referenced as a
+waitee, it must have a unique name (no duplicate waitees).
 .TP
-.BI latency_target \fR=\fPint
-If set, fio will attempt to find the max performance point that the given
-workload will run at while maintaining a latency below this target. The
-values is given in microseconds. See \fBlatency_window\fR and
-\fBlatency_percentile\fR.
+.BI nice \fR=\fPint
+Run the job with the given nice value. See man \fBnice\fR\|(2).
+.\" ignore blank line here from HOWTO as it looks normal without it
+On Windows, values less than \-15 set the process class to "High"; \-1 through
+\-15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle"
+priority class.
 .TP
-.BI latency_window \fR=\fPint
-Used with \fBlatency_target\fR to specify the sample window that the job
-is run at varying queue depths to test the performance. The value is given
-in microseconds.
+.BI prio \fR=\fPint
+Set the I/O priority value of this job. Linux limits us to a positive value
+between 0 and 7, with 0 being the highest. See man
+\fBionice\fR\|(1). Refer to an appropriate manpage for other operating
+systems since the meaning of priority may differ.
 .TP
-.BI latency_percentile \fR=\fPfloat
-The percentage of IOs that must fall within the criteria specified by
-\fBlatency_target\fR and \fBlatency_window\fR. If not set, this defaults
-to 100.0, meaning that all IOs must be equal or below to the value set
-by \fBlatency_target\fR.
-.TP
-.BI max_latency \fR=\fPint
-If set, fio will exit the job if it exceeds this maximum latency. It will exit
-with an ETIME error.
+.BI prioclass \fR=\fPint
+Set the I/O priority class. See man \fBionice\fR\|(1).
 .TP
 .BI cpumask \fR=\fPint
-Set CPU affinity for this job. \fIint\fR is a bitmask of allowed CPUs the job
-may run on.  See \fBsched_setaffinity\fR\|(2).
+Set the CPU affinity of this job. The parameter given is a bit mask of
+allowed CPUs the job may run on. So if you want the allowed CPUs to be 1
+and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
+\fBsched_setaffinity\fR\|(2). This may not work on all supported
+operating systems or kernel versions. This option doesn't work well for a
+higher CPU count than what you can store in an integer mask, so it can only
+control cpus 1\-32. For boxes with larger CPU counts, use
+\fBcpus_allowed\fR.
 .TP
 .BI cpus_allowed \fR=\fPstr
-Same as \fBcpumask\fR, but allows a comma-delimited list of CPU numbers.
+Controls the same options as \fBcpumask\fR, but accepts a textual
+specification of the permitted CPUs instead. So to use CPUs 1 and 5 you
+would specify `cpus_allowed=1,5'. This option also allows a range of CPUs
+to be specified \-\- say you wanted a binding to CPUs 1, 5, and 8 to 15, you
+would set `cpus_allowed=1,5,8\-15'.
 .TP
 .BI cpus_allowed_policy \fR=\fPstr
-Set the policy of how fio distributes the CPUs specified by \fBcpus_allowed\fR
-or \fBcpumask\fR. Two policies are supported:
+Set the policy of how fio distributes the CPUs specified by
+\fBcpus_allowed\fR or \fBcpumask\fR. Two policies are supported:
 .RS
 .RS
 .TP
@@ -1109,827 +2010,711 @@
 Each job will get a unique CPU from the CPU set.
 .RE
 .P
-\fBshared\fR is the default behaviour, if the option isn't specified. If
-\fBsplit\fR is specified, then fio will assign one cpu per job. If not enough
-CPUs are given for the jobs listed, then fio will roundrobin the CPUs in
-the set.
+\fBshared\fR is the default behavior, if the option isn't specified. If
+\fBsplit\fR is specified, then fio will assign one cpu per job. If not
+enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs
+in the set.
 .RE
-.P
 .TP
 .BI numa_cpu_nodes \fR=\fPstr
 Set this job running on specified NUMA nodes' CPUs. The arguments allow
-comma delimited list of cpu numbers, A-B ranges, or 'all'.
+comma delimited list of cpu numbers, A\-B ranges, or `all'. Note, to enable
+NUMA options support, fio must be built on a system with libnuma\-dev(el)
+installed.
 .TP
 .BI numa_mem_policy \fR=\fPstr
-Set this job's memory policy and corresponding NUMA nodes. Format of
-the arguments:
+Set this job's memory policy and corresponding NUMA nodes. Format of the
+arguments:
 .RS
-.TP
-.B <mode>[:<nodelist>]
-.TP
-.B mode
-is one of the following memory policy:
-.TP
-.B default, prefer, bind, interleave, local
-.TP
+.RS
+.P
+<mode>[:<nodelist>]
+.RE
+.P
+`mode' is one of the following memory policies: `default', `prefer',
+`bind', `interleave' or `local'. For `default' and `local' memory
+policies, no node needs to be specified. For `prefer', only one node is
+allowed. For `bind' and `interleave' the `nodelist' may be as
+follows: a comma delimited list of numbers, A\-B ranges, or `all'.
 .RE
-For \fBdefault\fR and \fBlocal\fR memory policy, no \fBnodelist\fR is
-needed to be specified. For \fBprefer\fR, only one node is
-allowed. For \fBbind\fR and \fBinterleave\fR, \fBnodelist\fR allows
-comma delimited list of numbers, A-B ranges, or 'all'.
-.TP
-.BI startdelay \fR=\fPirange
-Delay start of job for the specified number of seconds. Supports all time
-suffixes to allow specification of hours, minutes, seconds and
-milliseconds - seconds are the default if a unit is omitted.
-Can be given as a range which causes each thread to choose randomly out of the
-range.
-.TP
-.BI runtime \fR=\fPint
-Terminate processing after the specified number of seconds.
-.TP
-.B time_based
-If given, run for the specified \fBruntime\fR duration even if the files are
-completely read or written. The same workload will be repeated as many times
-as \fBruntime\fR allows.
-.TP
-.BI ramp_time \fR=\fPint
-If set, fio will run the specified workload for this amount of time before
-logging any performance numbers. Useful for letting performance settle before
-logging results, thus minimizing the runtime required for stable results. Note
-that the \fBramp_time\fR is considered lead in time for a job, thus it will
-increase the total runtime if a special timeout or runtime is specified.
 .TP
-.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float
-Define the criterion and limit for assessing steady state performance. The
-first parameter designates the criterion whereas the second parameter sets the
-threshold. When the criterion falls below the threshold for the specified
-duration, the job will stop. For example, iops_slope:0.1% will direct fio
-to terminate the job when the least squares regression slope falls below 0.1%
-of the mean IOPS. If group_reporting is enabled this will apply to all jobs in
-the group. All assessments are carried out using only data from the rolling
-collection window. Threshold limits can be expressed as a fixed value or as a
-percentage of the mean in the collection window. Below are the available steady
-state assessment criteria.
+.BI cgroup \fR=\fPstr
+Add job to this control group. If it doesn't exist, it will be created. The
+system must have a mounted cgroup blkio mount point for this to work. If
+your system doesn't have it mounted, you can do so with:
 .RS
 .RS
-.TP
-.B iops
-Collect IOPS data. Stop the job if all individual IOPS measurements are within
-the specified limit of the mean IOPS (e.g., iops:2 means that all individual
-IOPS values must be within 2 of the mean, whereas iops:0.2% means that all
-individual IOPS values must be within 0.2% of the mean IOPS to terminate the
-job).
-.TP
-.B iops_slope
-Collect IOPS data and calculate the least squares regression slope. Stop the
-job if the slope falls below the specified limit.
-.TP
-.B bw
-Collect bandwidth data. Stop the job if all individual bandwidth measurements
-are within the specified limit of the mean bandwidth.
-.TP
-.B bw_slope
-Collect bandwidth data and calculate the least squares regression slope. Stop
-the job if the slope falls below the specified limit.
+.P
+# mount \-t cgroup \-o blkio none /cgroup
 .RE
 .RE
 .TP
-.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime
-A rolling window of this duration will be used to judge whether steady state
-has been reached. Data will be collected once per second. The default is 0
-which disables steady state detection.
-.TP
-.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime
-Allow the job to run for the specified duration before beginning data collection
-for checking the steady state job termination criterion. The default is 0.
-.TP
-.BI invalidate \fR=\fPbool
-Invalidate buffer-cache for the file prior to starting I/O.  Default: true.
+.BI cgroup_weight \fR=\fPint
+Set the weight of the cgroup to this value. See the documentation that comes
+with the kernel, allowed values are in the range of 100..1000.
 .TP
-.BI sync \fR=\fPbool
-Use synchronous I/O for buffered writes.  For the majority of I/O engines,
-this means using O_SYNC.  Default: false.
+.BI cgroup_nodelete \fR=\fPbool
+Normally fio will delete the cgroups it has created after the job
+completion. To override this behavior and to leave cgroups around after the
+job completion, set `cgroup_nodelete=1'. This can be useful if one wants
+to inspect various cgroup files after job completion. Default: false.
 .TP
-.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr
-Allocation method for I/O unit buffer.  Allowed values are:
-.RS
-.RS
+.BI flow_id \fR=\fPint
+The ID of the flow. If not specified, it defaults to being a global
+flow. See \fBflow\fR.
 .TP
-.B malloc
-Allocate memory with \fBmalloc\fR\|(3). Default memory type.
+.BI flow \fR=\fPint
+Weight in token\-based flow control. If this value is used, then there is
+a 'flow counter' which is used to regulate the proportion of activity between
+two or more jobs. Fio attempts to keep this flow counter near zero. The
+\fBflow\fR parameter stands for how much should be added or subtracted to the
+flow counter on each iteration of the main I/O loop. That is, if one job has
+`flow=8' and another job has `flow=\-1', then there will be a roughly 1:8
+ratio in how much one runs vs the other.
 .TP
-.B shm
-Use shared memory buffers allocated through \fBshmget\fR\|(2).
+.BI flow_watermark \fR=\fPint
+The maximum value that the absolute value of the flow counter is allowed to
+reach before the job must wait for a lower value of the counter.
 .TP
-.B shmhuge
-Same as \fBshm\fR, but use huge pages as backing.
+.BI flow_sleep \fR=\fPint
+The period of time, in microseconds, to wait after the flow watermark has
+been exceeded before retrying operations.
 .TP
-.B mmap
-Use \fBmmap\fR\|(2) for allocation.  Uses anonymous memory unless a filename
-is given after the option in the format `:\fIfile\fR'.
+.BI stonewall "\fR,\fB wait_for_previous"
+Wait for preceding jobs in the job file to exit, before starting this
+one. Can be used to insert serialization points in the job file. A stone
+wall also implies starting a new reporting group, see
+\fBgroup_reporting\fR.
+.TP
+.BI exitall
+By default, fio will continue running all other jobs when one job finishes
+but sometimes this is not the desired action. Setting \fBexitall\fR will
+instead make fio terminate all other jobs when one job finishes.
 .TP
-.B mmaphuge
-Same as \fBmmap\fR, but use huge files as backing.
+.BI exec_prerun \fR=\fPstr
+Before running this job, issue the command specified through
+\fBsystem\fR\|(3). Output is redirected in a file called `jobname.prerun.txt'.
 .TP
-.B mmapshared
-Same as \fBmmap\fR, but use a MMAP_SHARED mapping.
-.RE
-.P
-The amount of memory allocated is the maximum allowed \fBblocksize\fR for the
-job multiplied by \fBiodepth\fR.  For \fBshmhuge\fR or \fBmmaphuge\fR to work,
-the system must have free huge pages allocated.  \fBmmaphuge\fR also needs to
-have hugetlbfs mounted, and \fIfile\fR must point there. At least on Linux,
-huge pages must be manually allocated. See \fB/proc/sys/vm/nr_hugehages\fR
-and the documentation for that. Normally you just need to echo an appropriate
-number, eg echoing 8 will ensure that the OS has 8 huge pages ready for
-use.
-.RE
+.BI exec_postrun \fR=\fPstr
+After the job completes, issue the command specified through
+\fBsystem\fR\|(3). Output is redirected in a file called `jobname.postrun.txt'.
 .TP
-.BI iomem_align \fR=\fPint "\fR,\fP mem_align" \fR=\fPint
-This indicates the memory alignment of the IO memory buffers. Note that the
-given alignment is applied to the first IO unit buffer, if using \fBiodepth\fR
-the alignment of the following buffers are given by the \fBbs\fR used. In
-other words, if using a \fBbs\fR that is a multiple of the page sized in the
-system, all buffers will be aligned to this value. If using a \fBbs\fR that
-is not page aligned, the alignment of subsequent IO memory buffers is the
-sum of the \fBiomem_align\fR and \fBbs\fR used.
+.BI uid \fR=\fPint
+Instead of running as the invoking user, set the user ID to this value
+before the thread/process does any work.
 .TP
-.BI hugepage\-size \fR=\fPint
-Defines the size of a huge page.  Must be at least equal to the system setting.
-Should be a multiple of 1MB. Default: 4MB.
+.BI gid \fR=\fPint
+Set group ID, see \fBuid\fR.
+.SS "Verification"
 .TP
-.B exitall
-Terminate all jobs when one finishes.  Default: wait for each job to finish.
+.BI verify_only
+Do not perform specified workload, only verify data still matches previous
+invocation of this workload. This option allows one to check data multiple
+times at a later date without overwriting it. This option makes sense only
+for workloads that write data, and does not support workloads with the
+\fBtime_based\fR option set.
 .TP
-.B exitall_on_error \fR=\fPbool
-Terminate all jobs if one job finishes in error.  Default: wait for each job
-to finish.
+.BI do_verify \fR=\fPbool
+Run the verify phase after a write phase. Only valid if \fBverify\fR is
+set. Default: true.
 .TP
-.BI bwavgtime \fR=\fPint
-Average bandwidth calculations over the given time in milliseconds. If the job
-also does bandwidth logging through \fBwrite_bw_log\fR, then the minimum of
-this option and \fBlog_avg_msec\fR will be used.  Default: 500ms.
+.BI verify \fR=\fPstr
+If writing to a file, fio can verify the file contents after each iteration
+of the job. Each verification method also implies verification of special
+header, which is written to the beginning of each block. This header also
+includes meta information, like offset of the block, block number, timestamp
+when block was written, etc. \fBverify\fR can be combined with
+\fBverify_pattern\fR option. The allowed values are:
+.RS
+.RS
 .TP
-.BI iopsavgtime \fR=\fPint
-Average IOPS calculations over the given time in milliseconds. If the job
-also does IOPS logging through \fBwrite_iops_log\fR, then the minimum of
-this option and \fBlog_avg_msec\fR will be used.  Default: 500ms.
+.B md5
+Use an md5 sum of the data area and store it in the header of
+each block.
 .TP
-.BI create_serialize \fR=\fPbool
-If true, serialize file creation for the jobs.  Default: true.
+.B crc64
+Use an experimental crc64 sum of the data area and store it in the
+header of each block.
 .TP
-.BI create_fsync \fR=\fPbool
-\fBfsync\fR\|(2) data file after creation.  Default: true.
+.B crc32c
+Use a crc32c sum of the data area and store it in the header of
+each block. This will automatically use hardware acceleration
+(e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will
+fall back to software crc32c if none is found. Generally the
+fastest checksum fio supports when hardware accelerated.
 .TP
-.BI create_on_open \fR=\fPbool
-If true, the files are not created until they are opened for IO by the job.
+.B crc32c\-intel
+Synonym for crc32c.
 .TP
-.BI create_only \fR=\fPbool
-If true, fio will only run the setup phase of the job. If files need to be
-laid out or updated on disk, only that will be done. The actual job contents
-are not executed.
+.B crc32
+Use a crc32 sum of the data area and store it in the header of each
+block.
 .TP
-.BI allow_file_create \fR=\fPbool
-If true, fio is permitted to create files as part of its workload. This is
-the default behavior. If this option is false, then fio will error out if the
-files it needs to use don't already exist. Default: true.
+.B crc16
+Use a crc16 sum of the data area and store it in the header of each
+block.
 .TP
-.BI allow_mounted_write \fR=\fPbool
-If this isn't set, fio will abort jobs that are destructive (eg that write)
-to what appears to be a mounted device or partition. This should help catch
-creating inadvertently destructive tests, not realizing that the test will
-destroy data on the mounted file system. Default: false.
+.B crc7
+Use a crc7 sum of the data area and store it in the header of each
+block.
 .TP
-.BI pre_read \fR=\fPbool
-If this is given, files will be pre-read into memory before starting the given
-IO operation. This will also clear the \fR \fBinvalidate\fR flag, since it is
-pointless to pre-read and then drop the cache. This will only work for IO
-engines that are seekable, since they allow you to read the same data
-multiple times. Thus it will not work on eg network or splice IO.
+.B xxhash
+Use xxhash as the checksum function. Generally the fastest software
+checksum that fio supports.
 .TP
-.BI unlink \fR=\fPbool
-Unlink job files when done.  Default: false.
+.B sha512
+Use sha512 as the checksum function.
 .TP
-.BI unlink_each_loop \fR=\fPbool
-Unlink job files after each iteration or loop.  Default: false.
+.B sha256
+Use sha256 as the checksum function.
 .TP
-.BI loops \fR=\fPint
-Specifies the number of iterations (runs of the same workload) of this job.
-Default: 1.
+.B sha1
+Use optimized sha1 as the checksum function.
 .TP
-.BI verify_only \fR=\fPbool
-Do not perform the specified workload, only verify data still matches previous
-invocation of this workload. This option allows one to check data multiple
-times at a later date without overwriting it. This option makes sense only for
-workloads that write data, and does not support workloads with the
-\fBtime_based\fR option set.
+.B sha3\-224
+Use optimized sha3\-224 as the checksum function.
 .TP
-.BI do_verify \fR=\fPbool
-Run the verify phase after a write phase.  Only valid if \fBverify\fR is set.
-Default: true.
+.B sha3\-256
+Use optimized sha3\-256 as the checksum function.
 .TP
-.BI verify \fR=\fPstr
-Method of verifying file contents after each iteration of the job. Each
-verification method also implies verification of special header, which is
-written to the beginning of each block. This header also includes meta
-information, like offset of the block, block number, timestamp when block
-was written, etc.  \fBverify\fR=str can be combined with \fBverify_pattern\fR=str
-option.  The allowed values are:
-.RS
-.RS
+.B sha3\-384
+Use optimized sha3\-384 as the checksum function.
 .TP
-.B md5 crc16 crc32 crc32c crc32c-intel crc64 crc7 sha256 sha512 sha1 xxhash
-Store appropriate checksum in the header of each block. crc32c-intel is
-hardware accelerated SSE4.2 driven, falls back to regular crc32c if
-not supported by the system.
+.B sha3\-512
+Use optimized sha3\-512 as the checksum function.
 .TP
 .B meta
-This option is deprecated, since now meta information is included in generic
-verification header and meta verification happens by default.  For detailed
-information see the description of the \fBverify\fR=str setting. This option
-is kept because of compatibility's sake with old configurations. Do not use it.
+This option is deprecated, since now meta information is included in
+generic verification header and meta verification happens by
+default. For detailed information see the description of the
+\fBverify\fR setting. This option is kept because of
+compatibility's sake with old configurations. Do not use it.
 .TP
 .B pattern
-Verify a strict pattern. Normally fio includes a header with some basic
-information and checksumming, but if this option is set, only the
-specific pattern set with \fBverify_pattern\fR is verified.
+Verify a strict pattern. Normally fio includes a header with some
+basic information and checksumming, but if this option is set, only
+the specific pattern set with \fBverify_pattern\fR is verified.
 .TP
 .B null
-Pretend to verify.  Used for testing internals.
+Only pretend to verify. Useful for testing internals with
+`ioengine=null', not for much else.
 .RE
-
-This option can be used for repeated burn-in tests of a system to make sure
-that the written data is also correctly read back. If the data direction given
-is a read or random read, fio will assume that it should verify a previously
-written file. If the data direction includes any form of write, the verify will
-be of the newly written data.
+.P
+This option can be used for repeated burn\-in tests of a system to make sure
+that the written data is also correctly read back. If the data direction
+given is a read or random read, fio will assume that it should verify a
+previously written file. If the data direction includes any form of write,
+the verify will be of the newly written data.
 .RE
 .TP
 .BI verifysort \fR=\fPbool
-If true, written verify blocks are sorted if \fBfio\fR deems it to be faster to
-read them back in a sorted manner.  Default: true.
+If true, fio will sort written verify blocks when it deems it faster to read
+them back in a sorted manner. This is often the case when overwriting an
+existing file, since the blocks are already laid out in the file system. You
+can ignore this option unless doing huge amounts of really fast I/O where
+the red\-black tree sorting CPU time becomes significant. Default: true.
 .TP
 .BI verifysort_nr \fR=\fPint
-Pre-load and sort verify blocks for a read workload.
+Pre\-load and sort verify blocks for a read workload.
 .TP
 .BI verify_offset \fR=\fPint
 Swap the verification header with data somewhere else in the block before
-writing.  It is swapped back before verifying.
+writing. It is swapped back before verifying.
 .TP
 .BI verify_interval \fR=\fPint
-Write the verification header for this number of bytes, which should divide
-\fBblocksize\fR.  Default: \fBblocksize\fR.
+Write the verification header at a finer granularity than the
+\fBblocksize\fR. It will be written for chunks the size of
+\fBverify_interval\fR, which should divide \fBblocksize\fR evenly.
 .TP
 .BI verify_pattern \fR=\fPstr
-If set, fio will fill the io buffers with this pattern. Fio defaults to filling
-with totally random bytes, but sometimes it's interesting to fill with a known
-pattern for io verification purposes. Depending on the width of the pattern,
-fio will fill 1/2/3/4 bytes of the buffer at the time(it can be either a
-decimal or a hex number). The verify_pattern if larger than a 32-bit quantity
-has to be a hex number that starts with either "0x" or "0X". Use with
-\fBverify\fP=str. Also, verify_pattern supports %o format, which means that for
-each block offset will be written and then verified back, e.g.:
+If set, fio will fill the I/O buffers with this pattern. Fio defaults to
+filling with totally random bytes, but sometimes it's interesting to fill
+with a known pattern for I/O verification purposes. Depending on the width
+of the pattern, fio will fill 1/2/3/4 bytes of the buffer at the time (it can
+be either a decimal or a hex number). The \fBverify_pattern\fR if larger than
+a 32\-bit quantity has to be a hex number that starts with either "0x" or
+"0X". Use with \fBverify\fR. Also, \fBverify_pattern\fR supports %o
+format, which means that for each block offset will be written and then
+verified back, e.g.:
 .RS
 .RS
-\fBverify_pattern\fR=%o
+.P
+verify_pattern=%o
 .RE
+.P
 Or use combination of everything:
-.LP
 .RS
-\fBverify_pattern\fR=0xff%o"abcd"-21
+.P
+verify_pattern=0xff%o"abcd"\-12
 .RE
 .RE
 .TP
 .BI verify_fatal \fR=\fPbool
-If true, exit the job on the first observed verification failure.  Default:
-false.
+Normally fio will keep checking the entire contents before quitting on a
+block verification failure. If this option is set, fio will exit the job on
+the first observed failure. Default: false.
 .TP
 .BI verify_dump \fR=\fPbool
-If set, dump the contents of both the original data block and the data block we
-read off disk to files. This allows later analysis to inspect just what kind of
-data corruption occurred. Off by default.
+If set, dump the contents of both the original data block and the data block
+we read off disk to files. This allows later analysis to inspect just what
+kind of data corruption occurred. Off by default.
 .TP
 .BI verify_async \fR=\fPint
-Fio will normally verify IO inline from the submitting thread. This option
-takes an integer describing how many async offload threads to create for IO
-verification instead, causing fio to offload the duty of verifying IO contents
-to one or more separate threads.  If using this offload option, even sync IO
-engines can benefit from using an \fBiodepth\fR setting higher than 1, as it
-allows them to have IO in flight while verifies are running.
+Fio will normally verify I/O inline from the submitting thread. This option
+takes an integer describing how many async offload threads to create for I/O
+verification instead, causing fio to offload the duty of verifying I/O
+contents to one or more separate threads. If using this offload option, even
+sync I/O engines can benefit from using an \fBiodepth\fR setting higher
+than 1, as it allows them to have I/O in flight while verifies are running.
+Defaults to 0 async threads, i.e. verification is not asynchronous.
 .TP
 .BI verify_async_cpus \fR=\fPstr
-Tell fio to set the given CPU affinity on the async IO verification threads.
-See \fBcpus_allowed\fP for the format used.
+Tell fio to set the given CPU affinity on the async I/O verification
+threads. See \fBcpus_allowed\fR for the format used.
 .TP
 .BI verify_backlog \fR=\fPint
 Fio will normally verify the written contents of a job that utilizes verify
 once that job has completed. In other words, everything is written then
 everything is read back and verified. You may want to verify continually
-instead for a variety of reasons. Fio stores the meta data associated with an
-IO block in memory, so for large verify workloads, quite a bit of memory would
-be used up holding this meta data. If this option is enabled, fio will write
-only N blocks before verifying these blocks.
+instead for a variety of reasons. Fio stores the meta data associated with
+an I/O block in memory, so for large verify workloads, quite a bit of memory
+would be used up holding this meta data. If this option is enabled, fio will
+write only N blocks before verifying these blocks.
 .TP
 .BI verify_backlog_batch \fR=\fPint
-Control how many blocks fio will verify if verify_backlog is set. If not set,
-will default to the value of \fBverify_backlog\fR (meaning the entire queue is
-read back and verified).  If \fBverify_backlog_batch\fR is less than
-\fBverify_backlog\fR then not all blocks will be verified,  if
-\fBverify_backlog_batch\fR is larger than \fBverify_backlog\fR,  some blocks
-will be verified more than once.
+Control how many blocks fio will verify if \fBverify_backlog\fR is
+set. If not set, will default to the value of \fBverify_backlog\fR
+(meaning the entire queue is read back and verified). If
+\fBverify_backlog_batch\fR is less than \fBverify_backlog\fR then not all
+blocks will be verified, if \fBverify_backlog_batch\fR is larger than
+\fBverify_backlog\fR, some blocks will be verified more than once.
+.TP
+.BI verify_state_save \fR=\fPbool
+When a job exits during the write phase of a verify workload, save its
+current state. This allows fio to replay up until that point, if the verify
+state is loaded for the verify read phase. The format of the filename is,
+roughly:
+.RS
+.RS
+.P
+<type>\-<jobname>\-<jobindex>\-verify.state.
+.RE
+.P
+<type> is "local" for a local run, "sock" for a client/server socket
+connection, and "ip" (192.168.0.1, for instance) for a networked
+client/server connection. Defaults to true.
+.RE
+.TP
+.BI verify_state_load \fR=\fPbool
+If a verify termination trigger was used, fio stores the current write state
+of each thread. This can be used at verification time so that fio knows how
+far it should verify. Without this information, fio will run a full
+verification pass, according to the settings in the job file used. Default:
+false.
 .TP
 .BI trim_percentage \fR=\fPint
 Number of verify blocks to discard/trim.
 .TP
 .BI trim_verify_zero \fR=\fPbool
-Verify that trim/discarded blocks are returned as zeroes.
+Verify that trim/discarded blocks are returned as zeros.
 .TP
 .BI trim_backlog \fR=\fPint
-Trim after this number of blocks are written.
+Trim after this number of blocks are written.
 .TP
 .BI trim_backlog_batch \fR=\fPint
-Trim this number of IO blocks.
+Trim this number of I/O blocks.
 .TP
 .BI experimental_verify \fR=\fPbool
 Enable experimental verification.
+.SS "Steady state"
 .TP
-.BI verify_state_save \fR=\fPbool
-When a job exits during the write phase of a verify workload, save its
-current state. This allows fio to replay up until that point, if the
-verify state is loaded for the verify read phase.
-.TP
-.BI verify_state_load \fR=\fPbool
-If a verify termination trigger was used, fio stores the current write
-state of each thread. This can be used at verification time so that fio
-knows how far it should verify. Without this information, fio will run
-a full verification pass, according to the settings in the job file used.
-.TP
-.B stonewall "\fR,\fP wait_for_previous"
-Wait for preceding jobs in the job file to exit before starting this one.
-\fBstonewall\fR implies \fBnew_group\fR.
-.TP
-.B new_group
-Start a new reporting group.  If not given, all jobs in a file will be part
-of the same reporting group, unless separated by a stonewall.
-.TP
-.BI numjobs \fR=\fPint
-Number of clones (processes/threads performing the same workload) of this job.
-Default: 1.
-.TP
-.B group_reporting
-If set, display per-group reports instead of per-job when \fBnumjobs\fR is
-specified.
-.TP
-.B thread
-Use threads created with \fBpthread_create\fR\|(3) instead of processes created
-with \fBfork\fR\|(2).
-.TP
-.BI zonesize \fR=\fPint
-Divide file into zones of the specified size in bytes.  See \fBzoneskip\fR.
-.TP
-.BI zonerange \fR=\fPint
-Give size of an IO zone.  See \fBzoneskip\fR.
-.TP
-.BI zoneskip \fR=\fPint
-Skip the specified number of bytes when \fBzonesize\fR bytes of data have been
-read.
+.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float
+Define the criterion and limit for assessing steady state performance. The
+first parameter designates the criterion whereas the second parameter sets
+the threshold. When the criterion falls below the threshold for the
+specified duration, the job will stop. For example, `iops_slope:0.1%' will
+direct fio to terminate the job when the least squares regression slope
+falls below 0.1% of the mean IOPS. If \fBgroup_reporting\fR is enabled
+this will apply to all jobs in the group. Below is the list of available
+steady state assessment criteria. All assessments are carried out using only
+data from the rolling collection window. Threshold limits can be expressed
+as a fixed value or as a percentage of the mean in the collection window.
+.RS
+.RS
 .TP
-.BI write_iolog \fR=\fPstr
-Write the issued I/O patterns to the specified file.  Specify a separate file
-for each job, otherwise the iologs will be interspersed and the file may be
-corrupt.
+.B iops
+Collect IOPS data. Stop the job if all individual IOPS measurements
+are within the specified limit of the mean IOPS (e.g., `iops:2'
+means that all individual IOPS values must be within 2 of the mean,
+whereas `iops:0.2%' means that all individual IOPS values must be
+within 0.2% of the mean IOPS to terminate the job).
 .TP
-.BI read_iolog \fR=\fPstr
-Replay the I/O patterns contained in the specified file generated by
-\fBwrite_iolog\fR, or may be a \fBblktrace\fR binary file.
+.B iops_slope
+Collect IOPS data and calculate the least squares regression
+slope. Stop the job if the slope falls below the specified limit.
 .TP
-.BI replay_no_stall \fR=\fPint
-While replaying I/O patterns using \fBread_iolog\fR the default behavior
-attempts to respect timing information between I/Os.  Enabling
-\fBreplay_no_stall\fR causes I/Os to be replayed as fast as possible while
-still respecting ordering.
+.B bw
+Collect bandwidth data. Stop the job if all individual bandwidth
+measurements are within the specified limit of the mean bandwidth.
 .TP
-.BI replay_redirect \fR=\fPstr
-While replaying I/O patterns using \fBread_iolog\fR the default behavior
-is to replay the IOPS onto the major/minor device that each IOP was recorded
-from.  Setting \fBreplay_redirect\fR causes all IOPS to be replayed onto the
-single specified device regardless of the device it was recorded from.
+.B bw_slope
+Collect bandwidth data and calculate the least squares regression
+slope. Stop the job if the slope falls below the specified limit.
+.RE
+.RE
 .TP
-.BI replay_align \fR=\fPint
-Force alignment of IO offsets and lengths in a trace to this power of 2 value.
+.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime
+A rolling window of this duration will be used to judge whether steady state
+has been reached. Data will be collected once per second. The default is 0
+which disables steady state detection. When the unit is omitted, the
+value is interpreted in seconds.
 .TP
-.BI replay_scale \fR=\fPint
-Scale sector offsets down by this factor when replaying traces.
+.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime
+Allow the job to run for the specified duration before beginning data
+collection for checking the steady state job termination criterion. The
+default is 0. When the unit is omitted, the value is interpreted in seconds.
+.SS "Measurements and reporting"
 .TP
 .BI per_job_logs \fR=\fPbool
 If set, this generates bw/clat/iops log with per file private filenames. If
-not set, jobs with identical names will share the log filename. Default: true.
+not set, jobs with identical names will share the log filename. Default:
+true.
+.TP
+.BI group_reporting
+It may sometimes be interesting to display statistics for groups of jobs as
+a whole instead of for each individual job. This is especially true if
+\fBnumjobs\fR is used; looking at individual thread/process output
+quickly becomes unwieldy. To see the final report per\-group instead of
+per\-job, use \fBgroup_reporting\fR. Jobs in a file will be part of the
+same reporting group, unless separated by a \fBstonewall\fR, or by
+using \fBnew_group\fR.
+.TP
+.BI new_group
+Start a new reporting group. See: \fBgroup_reporting\fR. If not given,
+all jobs in a file will be part of the same reporting group, unless
+separated by a \fBstonewall\fR.
+.TP
+.BI stats \fR=\fPbool
+By default, fio collects and shows final output results for all jobs
+that run. If this option is set to 0, then fio will ignore this job in
+the final stat output.
 .TP
 .BI write_bw_log \fR=\fPstr
-If given, write a bandwidth log for this job. Can be used to store data of the
-bandwidth of the jobs in their lifetime. The included fio_generate_plots script
-uses gnuplot to turn these text files into nice graphs. See \fBwrite_lat_log\fR
-for behaviour of given filename. For this option, the postfix is _bw.x.log,
-where x is the index of the job (1..N, where N is the number of jobs). If
-\fBper_job_logs\fR is false, then the filename will not include the job index.
-See the \fBLOG FILE FORMATS\fR
-section.
+If given, write a bandwidth log for this job. Can be used to store data of
+the bandwidth of the jobs in their lifetime. The included
+\fBfio_generate_plots\fR script uses gnuplot to turn these
+text files into nice graphs. See \fBwrite_lat_log\fR for behavior of
+given filename. For this option, the postfix is `_bw.x.log', where `x'
+is the index of the job (1..N, where N is the number of jobs). If
+\fBper_job_logs\fR is false, then the filename will not include the job
+index. See \fBLOG FILE FORMATS\fR section.
 .TP
 .BI write_lat_log \fR=\fPstr
-Same as \fBwrite_bw_log\fR, but writes I/O completion latencies.  If no
-filename is given with this option, the default filename of
-"jobname_type.x.log" is used, where x is the index of the job (1..N, where
-N is the number of jobs). Even if the filename is given, fio will still
-append the type of log. If \fBper_job_logs\fR is false, then the filename will
-not include the job index. See the \fBLOG FILE FORMATS\fR section.
+Same as \fBwrite_bw_log\fR, except that this option stores I/O
+submission, completion, and total latencies instead. If no filename is given
+with this option, the default filename of `jobname_type.log' is
+used. Even if the filename is given, fio will still append the type of
+log. So if one specifies:
+.RS
+.RS
+.P
+write_lat_log=foo
+.RE
+.P
+The actual log names will be `foo_slat.x.log', `foo_clat.x.log',
+and `foo_lat.x.log', where `x' is the index of the job (1..N, where N
+is the number of jobs). This helps \fBfio_generate_plots\fR find the
+logs automatically. If \fBper_job_logs\fR is false, then the filename
+will not include the job index. See \fBLOG FILE FORMATS\fR section.
+.RE
 .TP
 .BI write_hist_log \fR=\fPstr
-Same as \fBwrite_lat_log\fR, but writes I/O completion latency histograms. If
-no filename is given with this option, the default filename of
-"jobname_clat_hist.x.log" is used, where x is the index of the job (1..N, where
-N is the number of jobs). Even if the filename is given, fio will still append
-the type of log. If \fBper_job_logs\fR is false, then the filename will not
-include the job index. See the \fBLOG FILE FORMATS\fR section.
+Same as \fBwrite_lat_log\fR, but writes I/O completion latency
+histograms. If no filename is given with this option, the default filename
+of `jobname_clat_hist.x.log' is used, where `x' is the index of the
+job (1..N, where N is the number of jobs). Even if the filename is given,
+fio will still append the type of log. If \fBper_job_logs\fR is false,
+then the filename will not include the job index. See \fBLOG FILE FORMATS\fR section.
 .TP
 .BI write_iops_log \fR=\fPstr
-Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given with this
-option, the default filename of "jobname_type.x.log" is used, where x is the
-index of the job (1..N, where N is the number of jobs). Even if the filename
-is given, fio will still append the type of log. If \fBper_job_logs\fR is false,
-then the filename will not include the job index. See the \fBLOG FILE FORMATS\fR
-section.
+Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given
+with this option, the default filename of `jobname_type.x.log' is
+used, where `x' is the index of the job (1..N, where N is the number of
+jobs). Even if the filename is given, fio will still append the type of
+log. If \fBper_job_logs\fR is false, then the filename will not include
+the job index. See \fBLOG FILE FORMATS\fR section.
 .TP
 .BI log_avg_msec \fR=\fPint
 By default, fio will log an entry in the iops, latency, or bw log for every
-IO that completes. When writing to the disk log, that can quickly grow to a
+I/O that completes. When writing to the disk log, that can quickly grow to a
 very large size. Setting this option makes fio average the each log entry
 over the specified period of time, reducing the resolution of the log. See
-\fBlog_max_value\fR as well.  Defaults to 0, logging all entries.
-.TP
-.BI log_max_value \fR=\fPbool
-If \fBlog_avg_msec\fR is set, fio logs the average over that window. If you
-instead want to log the maximum value, set this option to 1.  Defaults to
-0, meaning that averaged values are logged.
+\fBlog_max_value\fR as well. Defaults to 0, logging all entries.
+Also see \fBLOG FILE FORMATS\fR section.
 .TP
 .BI log_hist_msec \fR=\fPint
-Same as \fBlog_avg_msec\fR, but logs entries for completion latency histograms.
-Computing latency percentiles from averages of intervals using \fBlog_avg_msec\fR
-is innacurate. Setting this option makes fio log histogram entries over the
-specified period of time, reducing log sizes for high IOPS devices while
-retaining percentile accuracy. See \fBlog_hist_coarseness\fR as well. Defaults
-to 0, meaning histogram logging is disabled.
+Same as \fBlog_avg_msec\fR, but logs entries for completion latency
+histograms. Computing latency percentiles from averages of intervals using
+\fBlog_avg_msec\fR is inaccurate. Setting this option makes fio log
+histogram entries over the specified period of time, reducing log sizes for
+high IOPS devices while retaining percentile accuracy. See
+\fBlog_hist_coarseness\fR as well. Defaults to 0, meaning histogram
+logging is disabled.
 .TP
 .BI log_hist_coarseness \fR=\fPint
-Integer ranging from 0 to 6, defining the coarseness of the resolution of the
-histogram logs enabled with \fBlog_hist_msec\fR. For each increment in
-coarseness, fio outputs half as many bins. Defaults to 0, for which histogram
-logs contain 1216 latency bins. See the \fBLOG FILE FORMATS\fR section.
+Integer ranging from 0 to 6, defining the coarseness of the resolution of
+the histogram logs enabled with \fBlog_hist_msec\fR. For each increment
+in coarseness, fio outputs half as many bins. Defaults to 0, for which
+histogram logs contain 1216 latency bins. See \fBLOG FILE FORMATS\fR section.
+.TP
+.BI log_max_value \fR=\fPbool
+If \fBlog_avg_msec\fR is set, fio logs the average over that window. If
+you instead want to log the maximum value, set this option to 1. Defaults to
+0, meaning that averaged values are logged.
 .TP
 .BI log_offset \fR=\fPbool
-If this is set, the iolog options will include the byte offset for the IO
-entry as well as the other data values.
+If this is set, the iolog options will include the byte offset for the I/O
+entry as well as the other data values. Defaults to 0 meaning that
+offsets are not present in logs. Also see \fBLOG FILE FORMATS\fR section.
 .TP
 .BI log_compression \fR=\fPint
-If this is set, fio will compress the IO logs as it goes, to keep the memory
-footprint lower. When a log reaches the specified size, that chunk is removed
-and compressed in the background. Given that IO logs are fairly highly
-compressible, this yields a nice memory savings for longer runs. The downside
-is that the compression will consume some background CPU cycles, so it may
-impact the run. This, however, is also true if the logging ends up consuming
-most of the system memory. So pick your poison. The IO logs are saved
-normally at the end of a run, by decompressing the chunks and storing them
-in the specified log file. This feature depends on the availability of zlib.
+If this is set, fio will compress the I/O logs as it goes, to keep the
+memory footprint lower. When a log reaches the specified size, that chunk is
+removed and compressed in the background. Given that I/O logs are fairly
+highly compressible, this yields a nice memory savings for longer runs. The
+downside is that the compression will consume some background CPU cycles, so
+it may impact the run. This, however, is also true if the logging ends up
+consuming most of the system memory. So pick your poison. The I/O logs are
+saved normally at the end of a run, by decompressing the chunks and storing
+them in the specified log file. This feature depends on the availability of
+zlib.
 .TP
 .BI log_compression_cpus \fR=\fPstr
-Define the set of CPUs that are allowed to handle online log compression
-for the IO jobs. This can provide better isolation between performance
+Define the set of CPUs that are allowed to handle online log compression for
+the I/O jobs. This can provide better isolation between performance
 sensitive jobs, and background compression work.
 .TP
 .BI log_store_compressed \fR=\fPbool
 If set, fio will store the log files in a compressed format. They can be
-decompressed with fio, using the \fB\-\-inflate-log\fR command line parameter.
-The files will be stored with a \fB\.fz\fR suffix.
+decompressed with fio, using the \fB\-\-inflate\-log\fR command line
+parameter. The files will be stored with a `.fz' suffix.
 .TP
 .BI log_unix_epoch \fR=\fPbool
 If set, fio will log Unix timestamps to the log files produced by enabling
-\fBwrite_type_log\fR for each log type, instead of the default zero-based
+write_type_log for each log type, instead of the default zero\-based
 timestamps.
 .TP
 .BI block_error_percentiles \fR=\fPbool
-If set, record errors in trim block-sized units from writes and trims and output
-a histogram of how many trims it took to get to errors, and what kind of error
-was encountered.
+If set, record errors in trim block\-sized units from writes and trims and
+output a histogram of how many trims it took to get to errors, and what kind
+of error was encountered.
+.TP
+.BI bwavgtime \fR=\fPint
+Average the calculated bandwidth over the given time. Value is specified in
+milliseconds. If the job also does bandwidth logging through
+\fBwrite_bw_log\fR, then the minimum of this option and
+\fBlog_avg_msec\fR will be used. Default: 500ms.
+.TP
+.BI iopsavgtime \fR=\fPint
+Average the calculated IOPS over the given time. Value is specified in
+milliseconds. If the job also does IOPS logging through
+\fBwrite_iops_log\fR, then the minimum of this option and
+\fBlog_avg_msec\fR will be used. Default: 500ms.
+.TP
+.BI disk_util \fR=\fPbool
+Generate disk utilization statistics, if the platform supports it.
+Default: true.
 .TP
 .BI disable_lat \fR=\fPbool
-Disable measurements of total latency numbers. Useful only for cutting
-back the number of calls to \fBgettimeofday\fR\|(2), as that does impact performance at
-really high IOPS rates.  Note that to really get rid of a large amount of these
-calls, this option must be used with disable_slat and disable_bw as well.
+Disable measurements of total latency numbers. Useful only for cutting back
+the number of calls to \fBgettimeofday\fR\|(2), as that does impact
+performance at really high IOPS rates. Note that to really get rid of a
+large amount of these calls, this option must be used with
+\fBdisable_slat\fR and \fBdisable_bw_measurement\fR as well.
 .TP
 .BI disable_clat \fR=\fPbool
-Disable measurements of completion latency numbers. See \fBdisable_lat\fR.
+Disable measurements of completion latency numbers. See
+\fBdisable_lat\fR.
 .TP
 .BI disable_slat \fR=\fPbool
-Disable measurements of submission latency numbers. See \fBdisable_lat\fR.
+Disable measurements of submission latency numbers. See
+\fBdisable_lat\fR.
 .TP
-.BI disable_bw_measurement \fR=\fPbool
-Disable measurements of throughput/bandwidth numbers. See \fBdisable_lat\fR.
+.BI disable_bw_measurement \fR=\fPbool "\fR,\fP disable_bw" \fR=\fPbool
+Disable measurements of throughput/bandwidth numbers. See
+\fBdisable_lat\fR.
 .TP
-.BI lockmem \fR=\fPint
-Pin the specified amount of memory with \fBmlock\fR\|(2).  Can be used to
-simulate a smaller amount of memory. The amount specified is per worker.
+.BI clat_percentiles \fR=\fPbool
+Enable the reporting of percentiles of completion latencies. This option is
+mutually exclusive with \fBlat_percentiles\fR.
 .TP
-.BI exec_prerun \fR=\fPstr
-Before running the job, execute the specified command with \fBsystem\fR\|(3).
-.RS
-Output is redirected in a file called \fBjobname.prerun.txt\fR
-.RE
+.BI lat_percentiles \fR=\fPbool
+Enable the reporting of percentiles of I/O latencies. This is similar to
+\fBclat_percentiles\fR, except that this includes the submission latency.
+This option is mutually exclusive with \fBclat_percentiles\fR.
 .TP
-.BI exec_postrun \fR=\fPstr
-Same as \fBexec_prerun\fR, but the command is executed after the job completes.
+.BI percentile_list \fR=\fPfloat_list
+Overwrite the default list of percentiles for completion latencies and the
+block error histogram. Each number is a floating\-point number in the range
+(0,100], and the maximum length of the list is 20. Use ':' to separate the
+numbers, and list the numbers in ascending order. For example,
+`\-\-percentile_list=99.5:99.9' will cause fio to report the values of
+completion latency below which 99.5% and 99.9% of the observed latencies
+fell, respectively.
+.SS "Error handling"
+.TP
+.BI exitall_on_error
+When one job finishes in error, terminate the rest. The default is to wait
+for each job to finish.
+.TP
+.BI continue_on_error \fR=\fPstr
+Normally fio will exit the job on the first observed failure. If this option
+is set, fio will continue the job when there is a 'non\-fatal error' (EIO or
+EILSEQ) until the runtime is exceeded or the I/O size specified is
+completed. If this option is used, there are two more stats that are
+appended, the total error count and the first error. The error field given
+in the stats is the first error that was hit during the run.
+The allowed values are:
+.RS
 .RS
-Output is redirected in a file called \fBjobname.postrun.txt\fR
-.RE
-.TP
-.BI ioscheduler \fR=\fPstr
-Attempt to switch the device hosting the file to the specified I/O scheduler.
 .TP
-.BI disk_util \fR=\fPbool
-Generate disk utilization statistics if the platform supports it. Default: true.
+.B none
+Exit on any I/O or verify errors.
 .TP
-.BI clocksource \fR=\fPstr
-Use the given clocksource as the base of timing. The supported options are:
-.RS
+.B read
+Continue on read errors, exit on all others.
 .TP
-.B gettimeofday
-\fBgettimeofday\fR\|(2)
+.B write
+Continue on write errors, exit on all others.
 .TP
-.B clock_gettime
-\fBclock_gettime\fR\|(2)
+.B io
+Continue on any I/O error, exit on all others.
 .TP
-.B cpu
-Internal CPU clock source
+.B verify
+Continue on verify errors, exit on all others.
 .TP
-.RE
-.P
-\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast
-(and fio is heavy on time calls). Fio will automatically use this clocksource
-if it's supported and considered reliable on the system it is running on,
-unless another clocksource is specifically set. For x86/x86-64 CPUs, this
-means supporting TSC Invariant.
+.B all
+Continue on all errors.
 .TP
-.BI gtod_reduce \fR=\fPbool
-Enable all of the \fBgettimeofday\fR\|(2) reducing options (disable_clat, disable_slat,
-disable_bw) plus reduce precision of the timeout somewhat to really shrink the
-\fBgettimeofday\fR\|(2) call count. With this option enabled, we only do about 0.4% of
-the gtod() calls we would have done if all time keeping was enabled.
+.B 0
+Backward\-compatible alias for 'none'.
 .TP
-.BI gtod_cpu \fR=\fPint
-Sometimes it's cheaper to dedicate a single thread of execution to just getting
-the current time. Fio (and databases, for instance) are very intensive on
-\fBgettimeofday\fR\|(2) calls. With this option, you can set one CPU aside for doing
-nothing but logging current time to a shared memory location. Then the other
-threads/processes that run IO workloads need only copy that segment, instead of
-entering the kernel with a \fBgettimeofday\fR\|(2) call. The CPU set aside for doing
-these time calls will be excluded from other uses. Fio will manually clear it
-from the CPU mask of other jobs.
+.B 1
+Backward\-compatible alias for 'all'.
+.RE
+.RE
 .TP
 .BI ignore_error \fR=\fPstr
-Sometimes you want to ignore some errors during test in that case you can specify
-error list for each error type.
-.br
-ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST
-.br
-errors for given error type is separated with ':'.
-Error may be symbol ('ENOSPC', 'ENOMEM') or an integer.
-.br
-Example: ignore_error=EAGAIN,ENOSPC:122 .
-.br
-This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from WRITE.
+Sometimes you want to ignore some errors during a test. In that case you can
+specify an error list for each error type, instead of only being able to
+ignore the default 'non\-fatal error' using \fBcontinue_on_error\fR.
+The format is `ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST'; errors
+for a given error type are separated with ':'. An error may be a symbol ('ENOSPC',
+'ENOMEM') or an integer. Example:
+.RS
+.RS
+.P
+ignore_error=EAGAIN,ENOSPC:122
+.RE
+.P
+This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from
+WRITE. This option works by overriding \fBcontinue_on_error\fR with
+the list of errors for each error type if any.
+.RE
 .TP
 .BI error_dump \fR=\fPbool
-If set dump every error even if it is non fatal, true by default. If disabled
-only fatal error will be dumped
+If set, dump every error even if it is non\-fatal; true by default. If
+disabled, only fatal errors will be dumped.
+.SS "Running predefined workloads"
+Fio includes predefined profiles that mimic the I/O workloads generated by
+other tools.
 .TP
 .BI profile \fR=\fPstr
-Select a specific builtin performance test.
-.TP
-.BI cgroup \fR=\fPstr
-Add job to this control group. If it doesn't exist, it will be created.
-The system must have a mounted cgroup blkio mount point for this to work. If
-your system doesn't have it mounted, you can do so with:
-
-# mount \-t cgroup \-o blkio none /cgroup
-.TP
-.BI cgroup_weight \fR=\fPint
-Set the weight of the cgroup to this value. See the documentation that comes
-with the kernel, allowed values are in the range of 100..1000.
-.TP
-.BI cgroup_nodelete \fR=\fPbool
-Normally fio will delete the cgroups it has created after the job completion.
-To override this behavior and to leave cgroups around after the job completion,
-set cgroup_nodelete=1. This can be useful if one wants to inspect various
-cgroup files after job completion. Default: false
-.TP
-.BI uid \fR=\fPint
-Instead of running as the invoking user, set the user ID to this value before
-the thread/process does any work.
-.TP
-.BI gid \fR=\fPint
-Set group ID, see \fBuid\fR.
-.TP
-.BI unit_base \fR=\fPint
-Base unit for reporting.  Allowed values are:
+The predefined workload to run. Current profiles are:
+.RS
 .RS
 .TP
-.B 0
-Use auto-detection (default).
-.TP
-.B 8
-Byte based.
+.B tiobench
+Threaded I/O bench (tiotest/tiobench) like workload.
 .TP
-.B 1
-Bit based.
+.B act
+Aerospike Certification Tool (ACT) like workload.
+.RE
 .RE
 .P
-.TP
-.BI flow_id \fR=\fPint
-The ID of the flow. If not specified, it defaults to being a global flow. See
-\fBflow\fR.
-.TP
-.BI flow \fR=\fPint
-Weight in token-based flow control. If this value is used, then there is a
-\fBflow counter\fR which is used to regulate the proportion of activity between
-two or more jobs. fio attempts to keep this flow counter near zero. The
-\fBflow\fR parameter stands for how much should be added or subtracted to the
-flow counter on each iteration of the main I/O loop. That is, if one job has
-\fBflow=8\fR and another job has \fBflow=-1\fR, then there will be a roughly
-1:8 ratio in how much one runs vs the other.
-.TP
-.BI flow_watermark \fR=\fPint
-The maximum value that the absolute value of the flow counter is allowed to
-reach before the job must wait for a lower value of the counter.
-.TP
-.BI flow_sleep \fR=\fPint
-The period of time, in microseconds, to wait after the flow watermark has been
-exceeded before retrying operations
-.TP
-.BI clat_percentiles \fR=\fPbool
-Enable the reporting of percentiles of completion latencies.
-.TP
-.BI percentile_list \fR=\fPfloat_list
-Overwrite the default list of percentiles for completion latencies and the
-block error histogram. Each number is a floating number in the range (0,100],
-and the maximum length of the list is 20. Use ':' to separate the
-numbers. For example, \-\-percentile_list=99.5:99.9 will cause fio to
-report the values of completion latency below which 99.5% and 99.9% of
-the observed latencies fell, respectively.
-.SS "Ioengine Parameters List"
-Some parameters are only valid when a specific ioengine is in use. These are
-used identically to normal parameters, with the caveat that when used on the
-command line, they must come after the ioengine.
-.TP
-.BI (cpuio)cpuload \fR=\fPint
-Attempt to use the specified percentage of CPU cycles.
-.TP
-.BI (cpuio)cpuchunks \fR=\fPint
-Split the load into cycles of the given time. In microseconds.
-.TP
-.BI (cpuio)exit_on_io_done \fR=\fPbool
-Detect when IO threads are done, then exit.
-.TP
-.BI (libaio)userspace_reap
-Normally, with the libaio engine in use, fio will use
-the io_getevents system call to reap newly returned events.
-With this flag turned on, the AIO ring will be read directly
-from user-space to reap events. The reaping mode is only
-enabled when polling for a minimum of 0 events (eg when
-iodepth_batch_complete=0).
-.TP
-.BI (pvsync2)hipri
-Set RWF_HIPRI on IO, indicating to the kernel that it's of
-higher priority than normal.
-.TP
-.BI (net,netsplice)hostname \fR=\fPstr
-The host name or IP address to use for TCP or UDP based IO.
-If the job is a TCP listener or UDP reader, the hostname is not
-used and must be omitted unless it is a valid UDP multicast address.
-.TP
-.BI (net,netsplice)port \fR=\fPint
-The TCP or UDP port to bind to or connect to. If this is used with
-\fBnumjobs\fR to spawn multiple instances of the same job type, then
-this will be the starting port number since fio will use a range of ports.
-.TP
-.BI (net,netsplice)interface \fR=\fPstr
-The IP address of the network interface used to send or receive UDP multicast
-packets.
-.TP
-.BI (net,netsplice)ttl \fR=\fPint
-Time-to-live value for outgoing UDP multicast packets. Default: 1
-.TP
-.BI (net,netsplice)nodelay \fR=\fPbool
-Set TCP_NODELAY on TCP connections.
-.TP
-.BI (net,netsplice)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr
-The network protocol to use. Accepted values are:
-.RS
+To view a profile's additional options use \fB\-\-cmdhelp\fR after specifying
+the profile. For example:
 .RS
 .TP
-.B tcp
-Transmission control protocol
-.TP
-.B tcpv6
-Transmission control protocol V6
+$ fio \-\-profile=act \-\-cmdhelp
+.RE
+.SS "Act profile options"
 .TP
-.B udp
-User datagram protocol
+.BI device\-names \fR=\fPstr
+Devices to use.
 .TP
-.B udpv6
-User datagram protocol V6
+.BI load \fR=\fPint
+ACT load multiplier. Default: 1.
 .TP
-.B unix
-UNIX domain socket
-.RE
-.P
-When the protocol is TCP or UDP, the port must also be given,
-as well as the hostname if the job is a TCP listener or UDP
-reader. For unix sockets, the normal filename option should be
-used and the port is invalid.
-.RE
-.TP
-.BI (net,netsplice)listen
-For TCP network connections, tell fio to listen for incoming
-connections rather than initiating an outgoing connection. The
-hostname must be omitted if this option is used.
-.TP
-.BI (net, pingpong) \fR=\fPbool
-Normally a network writer will just continue writing data, and a network reader
-will just consume packets. If pingpong=1 is set, a writer will send its normal
-payload to the reader, then wait for the reader to send the same payload back.
-This allows fio to measure network latencies. The submission and completion
-latencies then measure local time spent sending or receiving, and the
-completion latency measures how long it took for the other end to receive and
-send back. For UDP multicast traffic pingpong=1 should only be set for a single
-reader when multiple readers are listening to the same address.
+.BI test\-duration \fR=\fPtime
+How long the entire test takes to run. When the unit is omitted, the value
+is given in seconds. Default: 24h.
 .TP
-.BI (net, window_size) \fR=\fPint
-Set the desired socket buffer size for the connection.
+.BI threads\-per\-queue \fR=\fPint
+Number of read I/O threads per device. Default: 8.
 .TP
-.BI (net, mss) \fR=\fPint
-Set the TCP maximum segment size (TCP_MAXSEG).
+.BI read\-req\-num\-512\-blocks \fR=\fPint
+Number of 512B blocks to read at a time. Default: 3.
 .TP
-.BI (e4defrag,donorname) \fR=\fPstr
-File will be used as a block donor (swap extents between files)
+.BI large\-block\-op\-kbytes \fR=\fPint
+Size of large block ops in KiB (writes). Default: 131072.
 .TP
-.BI (e4defrag,inplace) \fR=\fPint
-Configure donor file block allocation strategy
-.RS
-.BI 0(default) :
-Preallocate donor's file on init
+.BI prep
+Set to run ACT prep phase.
+.SS "Tiobench profile options"
 .TP
-.BI 1:
-allocate space immediately inside defragment event, and free right after event
-.RE
-.TP 
-.BI (rbd)clustername \fR=\fPstr
-Specifies the name of the ceph cluster.
+.BI size \fR=\fPstr
+Size in MiB.
 .TP
-.BI (rbd)rbdname \fR=\fPstr
-Specifies the name of the RBD.
+.BI block \fR=\fPint
+Block size in bytes. Default: 4096.
 .TP
-.BI (rbd)pool \fR=\fPstr
-Specifies the name of the Ceph pool containing the RBD.
+.BI numruns \fR=\fPint
+Number of runs.
 .TP
-.BI (rbd)clientname \fR=\fPstr
-Specifies the username (without the 'client.' prefix) used to access the Ceph
-cluster. If the clustername is specified, the clientname shall be the full
-type.id string. If no type. prefix is given, fio will add 'client.' by default.
+.BI dir \fR=\fPstr
+Test directory.
 .TP
-.BI (mtd)skipbad \fR=\fPbool
-Skip operations against known bad blocks.
+.BI threads \fR=\fPint
+Number of threads.
 .SH OUTPUT
-While running, \fBfio\fR will display the status of the created jobs.  For
-example:
-.RS
-.P
-Threads: 1: [_r] [24.8% done] [ 13509/  8334 kb/s] [eta 00h:01m:31s]
-.RE
+Fio spits out a lot of output. While running, fio will display the status of the
+jobs created. An example of that would be:
 .P
-The characters in the first set of brackets denote the current status of each
-threads.  The possible values are:
-.P
-.PD 0
+.nf
+		Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s]
+.fi
+.P
+The characters inside the first set of square brackets denote the current status of
+each thread. The first character is the first job defined in the job file, and so
+forth. The possible values (in typical life cycle order) are:
 .RS
 .TP
+.PD 0
 .B P
-Setup but not started.
+Thread setup, but not started.
 .TP
 .B C
 Thread created.
 .TP
 .B I
-Initialized, waiting.
+Thread initialized, waiting or generating necessary data.
+.TP
+.B p
+Thread running pre\-reading file(s).
+.TP
+.B /
+Thread is in ramp period.
 .TP
 .B R
 Running, doing sequential reads.
@@ -1949,563 +2734,759 @@
 .B m
 Running, doing mixed random reads/writes.
 .TP
+.B D
+Running, doing sequential trims.
+.TP
+.B d
+Running, doing random trims.
+.TP
 .B F
 Running, currently waiting for \fBfsync\fR\|(2).
 .TP
 .B V
-Running, verifying written data.
+Running, doing verification of written data.
+.TP
+.B f
+Thread finishing.
 .TP
 .B E
-Exited, not reaped by main thread.
+Thread exited, not reaped by main thread yet.
 .TP
 .B \-
-Exited, thread reaped.
-.RE
-.PD
-.P
-The second set of brackets shows the estimated completion percentage of
-the current group.  The third set shows the read and write I/O rate,
-respectively. Finally, the estimated run time of the job is displayed.
-.P
-When \fBfio\fR completes (or is interrupted by Ctrl-C), it will show data
-for each thread, each group of threads, and each disk, in that order.
-.P
-Per-thread statistics first show the threads client number, group-id, and
-error code.  The remaining figures are as follows:
-.RS
-.TP
-.B io
-Number of megabytes of I/O performed.
+Thread reaped.
 .TP
-.B bw
-Average data rate (bandwidth).
+.B X
+Thread reaped, exited with an error.
 .TP
-.B runt
-Threads run time.
+.B K
+Thread reaped, exited due to signal.
+.PD
+.RE
+.P
+Fio will condense the thread string so as not to take up more space on the command
+line than needed. For instance, if you have 10 readers and 10 writers running,
+the output would look like this:
+.P
+.nf
+		Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s]
+.fi
+.P
+Note that the status string is displayed in order, so it's possible to tell which of
+the jobs are currently doing what. In the example above this means that jobs 1\-\-10
+are readers and 11\-\-20 are writers.
+.P
+The other values are fairly self explanatory \-\- number of threads currently
+running and doing I/O, the number of currently open files (f=), the estimated
+completion percentage, the rate of I/O since last check (read speed listed first,
+then write speed and optionally trim speed) in terms of bandwidth and IOPS,
+and time to completion for the current running group. It's impossible to estimate
+runtime of the following groups (if any).
+.P
+When fio is done (or interrupted by Ctrl\-C), it will show the data for
+each thread, group of threads, and disks in that order. For each overall thread (or
+group) the output looks like:
+.P
+.nf
+		Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017
+		  write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec)
+		    slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50
+		    clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31
+		     lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79
+		    clat percentiles (usec):
+		     |  1.00th=[  302],  5.00th=[  326], 10.00th=[  343], 20.00th=[  363],
+		     | 30.00th=[  392], 40.00th=[  404], 50.00th=[  416], 60.00th=[  445],
+		     | 70.00th=[  816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627],
+		     | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877],
+		     | 99.99th=[78119]
+		   bw (  KiB/s): min=  532, max=  686, per=0.10%, avg=622.87, stdev=24.82, samples=  100
+		   iops        : min=   76, max=   98, avg=88.98, stdev= 3.54, samples=  100
+		  lat (usec)   : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79%
+		  lat (msec)   : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37%
+		  lat (msec)   : 100=0.65%
+		  cpu          : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21
+		  IO depths    : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0%
+		     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+		     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+		     issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0
+		     latency   : target=0, window=0, percentile=100.00%, depth=8
+.fi
+.P
+The job name (or first job's name when using \fBgroup_reporting\fR) is printed,
+along with the group id, count of jobs being aggregated, last error id seen (which
+is 0 when there are no errors), pid/tid of that thread and the time the job/group
+completed. Below are the I/O statistics for each data direction performed (showing
+writes in the example above). In the order listed, they denote:
+.RS
+.TP
+.B read/write/trim
+The string before the colon shows the I/O direction the statistics
+are for. \fIIOPS\fR is the average I/Os performed per second. \fIBW\fR
+is the average bandwidth rate shown as: value in power of 2 format
+(value in power of 10 format). The last two values show: (total
+I/O performed in power of 2 format / \fIruntime\fR of that thread).
 .TP
 .B slat
-Submission latency minimum, maximum, average and standard deviation. This is
-the time it took to submit the I/O.
+Submission latency (\fImin\fR being the minimum, \fImax\fR being the
+maximum, \fIavg\fR being the average, \fIstdev\fR being the standard
+deviation). This is the time it took to submit the I/O. For
+sync I/O this row is not displayed as the slat is really the
+completion latency (since queue/complete is one operation there).
+This value can be in nanoseconds, microseconds or milliseconds \-\-\-
+fio will choose the most appropriate base and print that (in the
+example above nanoseconds was the best scale). Note: in \fB\-\-minimal\fR mode
+latencies are always expressed in microseconds.
 .TP
 .B clat
-Completion latency minimum, maximum, average and standard deviation.  This
-is the time between submission and completion.
+Completion latency. Same names as slat, this denotes the time from
+submission to completion of the I/O pieces. For sync I/O, clat will
+usually be equal (or very close) to 0, as the time from submit to
+complete is basically just CPU time (I/O has already been done, see slat
+explanation).
+.TP
+.B lat
+Total latency. Same names as slat and clat, this denotes the time from
+when fio created the I/O unit to completion of the I/O operation.
 .TP
 .B bw
-Bandwidth minimum, maximum, percentage of aggregate bandwidth received, average
-and standard deviation.
+Bandwidth statistics based on samples. Same names as the xlat stats,
+but also includes the number of samples taken (\fIsamples\fR) and an
+approximate percentage of total aggregate bandwidth this thread
+received in its group (\fIper\fR). This last value is only really
+useful if the threads in this group are on the same disk, since they
+are then competing for disk access.
+.TP
+.B iops
+IOPS statistics based on samples. Same names as \fBbw\fR.
+.TP
+.B lat (nsec/usec/msec)
+The distribution of I/O completion latencies. This is the time from when
+I/O leaves fio until it gets completed. Unlike the separate
+read/write/trim sections above, the data here and in the remaining
+sections apply to all I/Os for the reporting group. 250=0.04% means that
+0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11%
+of the I/Os required 250 to 499us for completion.
 .TP
 .B cpu
-CPU usage statistics. Includes user and system time, number of context switches
-this thread went through and number of major and minor page faults. The CPU
-utilization numbers are averages for the jobs in that reporting group, while
-the context and fault counters are summed.
+CPU usage. User and system time, along with the number of context
+switches this thread went through, and finally the number of major
+and minor page faults. The CPU utilization numbers are averages for
+the jobs in that reporting group, while the context and fault counters
+are summed.
 .TP
 .B IO depths
-Distribution of I/O depths.  Each depth includes everything less than (or equal)
-to it, but greater than the previous depth.
-.TP
-.B IO issued
-Number of read/write requests issued, and number of short read/write requests.
-.TP
-.B IO latencies
-Distribution of I/O completion latencies.  The numbers follow the same pattern
-as \fBIO depths\fR.
-.RE
+The distribution of I/O depths over the job lifetime. The numbers are
+divided into powers of 2 and each entry covers depths from that value
+up to those that are lower than the next entry \-\- e.g., 16= covers
+depths from 16 to 31. Note that the range covered by a depth
+distribution entry can be different to the range covered by the
+equivalent \fBsubmit\fR/\fBcomplete\fR distribution entry.
+.TP
+.B IO submit
+How many pieces of I/O were submitted in a single submit call. Each
+entry denotes that amount and below, until the previous entry \-\- e.g.,
+16=100% means that we submitted anywhere between 9 and 16 I/Os per submit
+call. Note that the range covered by a \fBsubmit\fR distribution entry can
+be different to the range covered by the equivalent depth distribution
+entry.
+.TP
+.B IO complete
+Like the above \fBsubmit\fR number, but for completions instead.
+.TP
+.B IO issued rwt
+The number of \fBread/write/trim\fR requests issued, and how many of them were
+short or dropped.
+.TP
+.B IO latency
+These values are for \fBlatency_target\fR and related options. When
+these options are engaged, this section describes the I/O depth required
+to meet the specified latency target.
+.RE
+.P
+After each client has been listed, the group statistics are printed. They
+will look like this:
+.P
+.nf
+		Run status group 0 (all jobs):
+		   READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s\-10.8MiB/s (10.9MB/s\-11.3MB/s), io=64.0MiB (67.1MB), run=2973\-3069msec
+		  WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s\-621KiB/s (630kB/s\-636kB/s), io=64.0MiB (67.1MB), run=52747\-53223msec
+.fi
 .P
-The group statistics show:
-.PD 0
+For each data direction it prints:
 .RS
 .TP
-.B io
-Number of megabytes I/O performed.
-.TP
-.B aggrb
-Aggregate bandwidth of threads in the group.
-.TP
-.B minb
-Minimum average bandwidth a thread saw.
-.TP
-.B maxb
-Maximum average bandwidth a thread saw.
+.B bw
+Aggregate bandwidth of threads in this group followed by the
+minimum and maximum bandwidth of all the threads in this group.
+Values outside of brackets are power\-of\-2 format and those
+within are the equivalent value in a power\-of\-10 format.
 .TP
-.B mint
-Shortest runtime of threads in the group.
+.B io
+Aggregate I/O performed of all threads in this group. The
+format is the same as \fBbw\fR.
 .TP
-.B maxt
-Longest runtime of threads in the group.
+.B run
+The shortest and longest runtimes of the threads in this group.
 .RE
-.PD
 .P
-Finally, disk statistics are printed with reads first:
-.PD 0
+And finally, the disk statistics are printed. This is Linux specific.
+They will look like this:
+.P
+.nf
+		  Disk stats (read/write):
+		    sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
+.fi
+.P
+Each value is printed for both reads and writes, with reads first. The
+numbers denote:
 .RS
 .TP
 .B ios
 Number of I/Os performed by all groups.
 .TP
 .B merge
-Number of merges in the I/O scheduler.
+Number of merges performed by the I/O scheduler.
 .TP
 .B ticks
 Number of ticks we kept the disk busy.
 .TP
-.B io_queue
+.B in_queue
 Total time spent in the disk queue.
 .TP
 .B util
-Disk utilization.
+The disk utilization. A value of 100% means we kept the disk
+busy constantly, 50% would be a disk idling half of the time.
 .RE
-.PD
 .P
-It is also possible to get fio to dump the current output while it is
-running, without terminating the job. To do that, send fio the \fBUSR1\fR
-signal.
+It is also possible to get fio to dump the current output while it is running,
+without terminating the job. To do that, send fio the USR1 signal. You can
+also get regularly timed dumps by using the \fB\-\-status\-interval\fR
+parameter, or by creating a file in `/tmp' named
+`fio\-dump\-status'. If fio sees this file, it will unlink it and dump the
+current output status.
 .SH TERSE OUTPUT
-If the \fB\-\-minimal\fR / \fB\-\-append-terse\fR options are given, the
-results will be printed/appended in a semicolon-delimited format suitable for
-scripted use.
-A job description (if provided) follows on a new line.  Note that the first
-number in the line is the version number. If the output has to be changed
-for some reason, this number will be incremented by 1 to signify that
-change.  The fields are:
+For scripted usage where you typically want to generate tables or graphs of the
+results, fio can output the results in a semicolon separated format. The format
+is one long line of values, such as:
 .P
-.RS
-.B terse version, fio version, jobname, groupid, error
+.nf
+		2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00%
+		A description of this job goes here.
+.fi
 .P
-Read status:
-.RS
-.B Total I/O \fR(KB)\fP, bandwidth \fR(KB/s)\fP, IOPS, runtime \fR(ms)\fP
+The job description (if provided) follows on a second line.
 .P
-Submission latency:
-.RS
-.B min, max, mean, standard deviation
-.RE
-Completion latency:
-.RS
-.B min, max, mean, standard deviation
-.RE
-Completion latency percentiles (20 fields):
-.RS
-.B Xth percentile=usec
-.RE
-Total latency:
-.RS
-.B min, max, mean, standard deviation
-.RE
-Bandwidth:
-.RS
-.B min, max, aggregate percentage of total, mean, standard deviation
-.RE
-.RE
+To enable terse output, use the \fB\-\-minimal\fR or
+`\-\-output\-format=terse' command line options. The
+first value is the version of the terse output format. If the output has to be
+changed for some reason, this number will be incremented by 1 to signify that
+change.
 .P
-Write status:
-.RS
-.B Total I/O \fR(KB)\fP, bandwidth \fR(KB/s)\fP, IOPS, runtime \fR(ms)\fP
+Split up, the format is as follows (comments in brackets denote when a
+field was introduced or whether it's specific to some terse version):
 .P
-Submission latency:
+.nf
+			terse version, fio version [v3], jobname, groupid, error
+.fi
 .RS
-.B min, max, mean, standard deviation
+.P
+.B
+READ status:
 .RE
-Completion latency:
+.P
+.nf
+			Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+			Submission latency: min, max, mean, stdev (usec)
+			Completion latency: min, max, mean, stdev (usec)
+			Completion latency percentiles: 20 fields (see below)
+			Total latency: min, max, mean, stdev (usec)
+			Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+			IOPS [v5]: min, max, mean, stdev, number of samples
+.fi
 .RS
-.B min, max, mean, standard deviation
+.P
+.B
+WRITE status:
 .RE
-Completion latency percentiles (20 fields):
+.P
+.nf
+			Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+			Submission latency: min, max, mean, stdev (usec)
+			Completion latency: min, max, mean, stdev (usec)
+			Completion latency percentiles: 20 fields (see below)
+			Total latency: min, max, mean, stdev (usec)
+			Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+			IOPS [v5]: min, max, mean, stdev, number of samples
+.fi
 .RS
-.B Xth percentile=usec
+.P
+.B
+TRIM status [all but version 3]:
 .RE
-Total latency:
+.P
+.nf
+			Fields are similar to \fBREAD/WRITE\fR status.
+.fi
 .RS
-.B min, max, mean, standard deviation
+.P
+.B
+CPU usage:
 .RE
-Bandwidth:
+.P
+.nf
+			user, system, context switches, major faults, minor faults
+.fi
 .RS
-.B min, max, aggregate percentage of total, mean, standard deviation
-.RE
+.P
+.B
+I/O depths:
 .RE
 .P
-CPU usage:
+.nf
+			<=1, 2, 4, 8, 16, 32, >=64
+.fi
 .RS
-.B user, system, context switches, major page faults, minor page faults
+.P
+.B
+I/O latencies microseconds:
 .RE
 .P
-IO depth distribution:
+.nf
+			<=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
+.fi
 .RS
-.B <=1, 2, 4, 8, 16, 32, >=64
+.P
+.B
+I/O latencies milliseconds:
 .RE
 .P
-IO latency distribution:
-.RS
-Microseconds:
+.nf
+			<=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
+.fi
 .RS
-.B <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
+.P
+.B
+Disk utilization [v3]:
 .RE
-Milliseconds:
+.P
+.nf
+			disk name, read ios, write ios, read merges, write merges, read ticks, write ticks, time spent in queue, disk utilization percentage
+.fi
 .RS
-.B <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
-.RE
+.P
+.B
+Additional Info (dependent on continue_on_error, default off):
 .RE
 .P
-Disk utilization (1 for each disk used):
+.nf
+			total # errors, first error code
+.fi
 .RS
-.B name, read ios, write ios, read merges, write merges, read ticks, write ticks, read in-queue time, write in-queue time, disk utilization percentage
+.P
+.B
+Additional Info (dependent on description being set):
 .RE
 .P
-Error Info (dependent on continue_on_error, default off):
+.nf
+			Text description
+.fi
+.P
+Completion latency percentiles can be a grouping of up to 20 sets, so for the
+terse output fio writes all of them. Each field will look like this:
+.P
+.nf
+		1.00%=6112
+.fi
+.P
+which is the Xth percentile, and the `usec' latency associated with it.
+.P
+For \fBDisk utilization\fR, all disks used by fio are shown. So for each disk there
+will be a disk utilization section.
+.P
+Below is a single line containing short names for each of the fields in the
+minimal output v3, separated by semicolons:
+.P
+.nf
+		terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+.fi
+.SH JSON OUTPUT
+The \fBjson\fR output format is intended to be both human readable and convenient
+for automated parsing. For the most part its sections mirror those of the
+\fBnormal\fR output. The \fBruntime\fR value is reported in msec and the \fBbw\fR value is
+reported in 1024 bytes per second units.
+.P
+.SH JSON+ OUTPUT
+The \fBjson+\fR output format is identical to the \fBjson\fR output format except that it
+adds a full dump of the completion latency bins. Each \fBbins\fR object contains a
+set of (key, value) pairs where keys are latency durations and values count how
+many I/Os had completion latencies of the corresponding duration. For example,
+consider:
 .RS
-.B total # errors, first error code
-.RE
 .P
-.B text description (if provided in config - appears on newline)
+"bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... }
 .RE
+.P
+This data indicates that one I/O required 87,552ns to complete, two I/Os required
+100,864ns to complete, and 7529 I/Os required 107,008ns to complete.
+.P
+Also included with fio is a Python script \fBfio_jsonplus_clat2csv\fR that takes
+json+ output and generates CSV\-formatted latency data suitable for plotting.
+.P
+The latency durations actually represent the midpoints of latency intervals.
+For details refer to `stat.h' in the fio source.
 .SH TRACE FILE FORMAT
-There are two trace file format that you can encounter. The older (v1) format
-is unsupported since version 1.20-rc3 (March 2008). It will still be described
+There are two trace file formats that you can encounter. The older (v1) format is
+unsupported since version 1.20\-rc3 (March 2008). It will still be described
 below in case that you get an old trace and want to understand it.
-
-In any case the trace is a simple text file with a single action per line.
-
 .P
+In any case the trace is a simple text file with a single action per line.
+.TP
 .B Trace file format v1
+Each line represents a single I/O action in the following format:
 .RS
-Each line represents a single io action in the following format:
-
+.RS
+.P
 rw, offset, length
-
-where rw=0/1 for read/write, and the offset and length entries being in bytes.
-
-This format is not supported in Fio versions => 1.20-rc3.
-
 .RE
 .P
+where `rw=0/1' for read/write, and the `offset' and `length' entries being in bytes.
+.P
+This format is not supported in fio versions >= 1.20\-rc3.
+.RE
+.TP
 .B Trace file format v2
+The second version of the trace file format was added in fio version 1.17. It
+allows access to more than one file per trace and has a bigger set of possible
+file actions.
 .RS
-The second version of the trace file format was added in Fio version 1.17.
-It allows one to access more then one file per trace and has a bigger set of
-possible file actions.
-
+.P
 The first line of the trace file has to be:
-
-\fBfio version 2 iolog\fR
-
+.RS
+.P
+"fio version 2 iolog"
+.RE
+.P
 Following this can be lines in two different formats, which are described below.
+.P
+.B
 The file management format:
-
-\fBfilename action\fR
-
-The filename is given as an absolute path. The action can be one of these:
-
+.RS
+filename action
 .P
-.PD 0
+The `filename' is given as an absolute path. The `action' can be one of these:
 .RS
 .TP
 .B add
-Add the given filename to the trace
+Add the given `filename' to the trace.
 .TP
 .B open
-Open the file with the given filename. The filename has to have been previously
-added with the \fBadd\fR action.
+Open the file with the given `filename'. The `filename' has to have
+been added with the \fBadd\fR action before.
 .TP
 .B close
-Close the file with the given filename. The file must have previously been
-opened.
+Close the file with the given `filename'. The file has to have been
+\fBopen\fRed before.
+.RE
 .RE
-.PD
 .P
-
-The file io action format:
-
-\fBfilename action offset length\fR
-
-The filename is given as an absolute path, and has to have been added and opened
-before it can be used with this format. The offset and length are given in
-bytes. The action can be one of these:
-
+.B
+The file I/O action format:
+.RS
+filename action offset length
 .P
-.PD 0
+The `filename' is given as an absolute path, and has to have been \fBadd\fRed and
+\fBopen\fRed before it can be used with this format. The `offset' and `length' are
+given in bytes. The `action' can be one of these:
 .RS
 .TP
 .B wait
-Wait for 'offset' microseconds. Everything below 100 is discarded.  The time is
-relative to the previous wait statement.
+Wait for `offset' microseconds. Everything below 100 is discarded.
+The time is relative to the previous `wait' statement.
 .TP
 .B read
-Read \fBlength\fR bytes beginning from \fBoffset\fR
+Read `length' bytes beginning from `offset'.
 .TP
 .B write
-Write \fBlength\fR bytes beginning from \fBoffset\fR
+Write `length' bytes beginning from `offset'.
 .TP
 .B sync
-fsync() the file
+\fBfsync\fR\|(2) the file.
 .TP
 .B datasync
-fdatasync() the file
+\fBfdatasync\fR\|(2) the file.
 .TP
 .B trim
-trim the given file from the given \fBoffset\fR for \fBlength\fR bytes
+Trim the given file from the given `offset' for `length' bytes.
+.RE
 .RE
-.PD
-.P
-
 .SH CPU IDLENESS PROFILING
-In some cases, we want to understand CPU overhead in a test. For example,
-we test patches for the specific goodness of whether they reduce CPU usage.
-fio implements a balloon approach to create a thread per CPU that runs at
-idle priority, meaning that it only runs when nobody else needs the cpu.
-By measuring the amount of work completed by the thread, idleness of each
-CPU can be derived accordingly.
-
-An unit work is defined as touching a full page of unsigned characters. Mean
-and standard deviation of time to complete an unit work is reported in "unit
-work" section. Options can be chosen to report detailed percpu idleness or
-overall system idleness by aggregating percpu stats.
-
+In some cases, we want to understand CPU overhead in a test. For example, we
+test patches for the specific goodness of whether they reduce CPU usage.
+Fio implements a balloon approach to create a thread per CPU that runs at idle
+priority, meaning that it only runs when nobody else needs the cpu.
+By measuring the amount of work completed by the thread, idleness of each CPU
+can be derived accordingly.
+.P
+A unit of work is defined as touching a full page of unsigned characters. Mean and
+standard deviation of time to complete a unit of work are reported in the "unit work"
+section. Options can be chosen to report detailed percpu idleness or overall
+system idleness by aggregating percpu stats.
 .SH VERIFICATION AND TRIGGERS
-Fio is usually run in one of two ways, when data verification is done. The
-first is a normal write job of some sort with verify enabled. When the
-write phase has completed, fio switches to reads and verifies everything
-it wrote. The second model is running just the write phase, and then later
-on running the same job (but with reads instead of writes) to repeat the
-same IO patterns and verify the contents. Both of these methods depend
-on the write phase being completed, as fio otherwise has no idea how much
-data was written.
-
-With verification triggers, fio supports dumping the current write state
-to local files. Then a subsequent read verify workload can load this state
-and know exactly where to stop. This is useful for testing cases where
-power is cut to a server in a managed fashion, for instance.
-
+Fio is usually run in one of two ways, when data verification is done. The first
+is a normal write job of some sort with verify enabled. When the write phase has
+completed, fio switches to reads and verifies everything it wrote. The second
+model is running just the write phase, and then later on running the same job
+(but with reads instead of writes) to repeat the same I/O patterns and verify
+the contents. Both of these methods depend on the write phase being completed,
+as fio otherwise has no idea how much data was written.
+.P
+With verification triggers, fio supports dumping the current write state to
+local files. Then a subsequent read verify workload can load this state and know
+exactly where to stop. This is useful for testing cases where power is cut to a
+server in a managed fashion, for instance.
+.P
 A verification trigger consists of two things:
-
 .RS
-Storing the write state of each job
-.LP
-Executing a trigger command
+.P
+1) Storing the write state of each job.
+.P
+2) Executing a trigger command.
 .RE
-
-The write state is relatively small, on the order of hundreds of bytes
-to single kilobytes. It contains information on the number of completions
-done, the last X completions, etc.
-
-A trigger is invoked either through creation (\fBtouch\fR) of a specified
-file in the system, or through a timeout setting. If fio is run with
-\fB\-\-trigger\-file=/tmp/trigger-file\fR, then it will continually check for
-the existence of /tmp/trigger-file. When it sees this file, it will
-fire off the trigger (thus saving state, and executing the trigger
+.P
+The write state is relatively small, on the order of hundreds of bytes to single
+kilobytes. It contains information on the number of completions done, the last X
+completions, etc.
+.P
+A trigger is invoked either through creation ('touch') of a specified file in
+the system, or through a timeout setting. If fio is run with
+`\-\-trigger\-file=/tmp/trigger\-file', then it will continually
+check for the existence of `/tmp/trigger\-file'. When it sees this file, it
+will fire off the trigger (thus saving state, and executing the trigger
 command).
-
-For client/server runs, there's both a local and remote trigger. If
-fio is running as a server backend, it will send the job states back
-to the client for safe storage, then execute the remote trigger, if
-specified. If a local trigger is specified, the server will still send
-back the write state, but the client will then execute the trigger.
-
+.P
+For client/server runs, there's both a local and remote trigger. If fio is
+running as a server backend, it will send the job states back to the client for
+safe storage, then execute the remote trigger, if specified. If a local trigger
+is specified, the server will still send back the write state, but the client
+will then execute the trigger.
 .RE
 .P
 .B Verification trigger example
 .RS
-
-Lets say we want to run a powercut test on the remote machine 'server'.
-Our write workload is in write-test.fio. We want to cut power to 'server'
-at some point during the run, and we'll run this test from the safety
-or our local machine, 'localbox'. On the server, we'll start the fio
-backend normally:
-
-server# \fBfio \-\-server\fR
-
+Let's say we want to run a powercut test on the remote Linux machine 'server'.
+Our write workload is in `write\-test.fio'. We want to cut power to 'server' at
+some point during the run, and we'll run this test from the safety of our local
+machine, 'localbox'. On the server, we'll start the fio backend normally:
+.RS
+.P
+server# fio \-\-server
+.RE
+.P
 and on the client, we'll fire off the workload:
-
-localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger-remote="bash \-c "echo b > /proc/sysrq-triger""\fR
-
-We set \fB/tmp/my-trigger\fR as the trigger file, and we tell fio to execute
-
-\fBecho b > /proc/sysrq-trigger\fR
-
-on the server once it has received the trigger and sent us the write
-state. This will work, but it's not \fIreally\fR cutting power to the server,
-it's merely abruptly rebooting it. If we have a remote way of cutting
-power to the server through IPMI or similar, we could do that through
-a local trigger command instead. Lets assume we have a script that does
-IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could
-then have run fio with a local trigger instead:
-
-localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi-reboot server"\fR
-
-For this case, fio would wait for the server to send us the write state,
-then execute 'ipmi-reboot server' when that happened.
-
+.RS
+.P
+localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger\-remote="bash \-c "echo b > /proc/sysrq\-trigger""
+.RE
+.P
+We set `/tmp/my\-trigger' as the trigger file, and we tell fio to execute:
+.RS
+.P
+echo b > /proc/sysrq\-trigger
+.RE
+.P
+on the server once it has received the trigger and sent us the write state. This
+will work, but it's not really cutting power to the server, it's merely
+abruptly rebooting it. If we have a remote way of cutting power to the server
+through IPMI or similar, we could do that through a local trigger command
+instead. Let's assume we have a script that does IPMI reboot of a given hostname,
+ipmi\-reboot. On localbox, we could then have run fio with a local trigger
+instead:
+.RS
+.P
+localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi\-reboot server"
+.RE
+.P
+For this case, fio would wait for the server to send us the write state, then
+execute `ipmi\-reboot server' when that happened.
 .RE
 .P
 .B Loading verify state
 .RS
-To load store write state, read verification job file must contain
-the verify_state_load option. If that is set, fio will load the previously
+To load stored write state, a read verification job file must contain the
+\fBverify_state_load\fR option. If that is set, fio will load the previously
 stored state. For a local fio run this is done by loading the files directly,
-and on a client/server run, the server backend will ask the client to send
-the files over and load them from there.
-
+and on a client/server run, the server backend will ask the client to send the
+files over and load them from there.
 .RE
-
 .SH LOG FILE FORMATS
-
 Fio supports a variety of log file formats, for logging latencies, bandwidth,
 and IOPS. The logs share a common format, which looks like this:
-
-.B time (msec), value, data direction, offset
-
-Time for the log entry is always in milliseconds. The value logged depends
-on the type of log, it will be one of the following:
-
+.RS
 .P
-.PD 0
+time (msec), value, data direction, block size (bytes), offset (bytes)
+.RE
+.P
+`Time' for the log entry is always in milliseconds. The `value' logged depends
+on the type of log; it will be one of the following:
+.RS
 .TP
 .B Latency log
-Value is in latency in usecs
+Value is latency in nsecs
 .TP
 .B Bandwidth log
-Value is in KB/sec
+Value is in KiB/sec
 .TP
 .B IOPS log
-Value is in IOPS
-.PD
-.P
-
-Data direction is one of the following:
-
+Value is IOPS
+.RE
 .P
-.PD 0
+`Data direction' is one of the following:
+.RS
 .TP
 .B 0
-IO is a READ
+I/O is a READ
 .TP
 .B 1
-IO is a WRITE
+I/O is a WRITE
 .TP
 .B 2
-IO is a TRIM
-.PD
-.P
-
-The \fIoffset\fR is the offset, in bytes, from the start of the file, for that
-particular IO. The logging of the offset can be toggled with \fBlog_offset\fR.
-
-If windowed logging is enabled through \fBlog_avg_msec\fR, then fio doesn't log
-individual IOs. Instead of logs the average values over the specified
-period of time. Since \fIdata direction\fR and \fIoffset\fR are per-IO values,
-they aren't applicable if windowed logging is enabled. If windowed logging
-is enabled and \fBlog_max_value\fR is set, then fio logs maximum values in
-that window instead of averages.
-
-For histogram logging the logs look like this:
-
-.B time (msec), data direction, block-size, bin 0, bin 1, ..., bin 1215
-
-Where 'bin i' gives the frequency of IO requests with a latency falling in
-the i-th bin. See \fBlog_hist_coarseness\fR for logging fewer bins.
-
+I/O is a TRIM
 .RE
-
+.P
+The entry's `block size' is always in bytes. The `offset' is the offset, in bytes,
+from the start of the file, for that particular I/O. The logging of the offset can be
+toggled with \fBlog_offset\fR.
+.P
+Fio defaults to logging every individual I/O. When IOPS are logged for individual
+I/Os the `value' entry will always be 1. If windowed logging is enabled through
+\fBlog_avg_msec\fR, fio logs the average values over the specified period of time.
+If windowed logging is enabled and \fBlog_max_value\fR is set, then fio logs
+maximum values in that window instead of averages. Since `data direction', `block size'
+and `offset' are per\-I/O values, if windowed logging is enabled they
+aren't applicable and will be 0.
 .SH CLIENT / SERVER
-Normally you would run fio as a stand-alone application on the machine
-where the IO workload should be generated. However, it is also possible to
-run the frontend and backend of fio separately. This makes it possible to
-have a fio server running on the machine(s) where the IO workload should
-be running, while controlling it from another machine.
-
-To start the server, you would do:
-
-\fBfio \-\-server=args\fR
-
-on that machine, where args defines what fio listens to. The arguments
-are of the form 'type:hostname or IP:port'. 'type' is either 'ip' (or ip4)
-for TCP/IP v4, 'ip6' for TCP/IP v6, or 'sock' for a local unix domain
-socket. 'hostname' is either a hostname or IP address, and 'port' is the port to
-listen to (only valid for TCP/IP, not a local socket). Some examples:
-
+Normally fio is invoked as a stand\-alone application on the machine where the
+I/O workload should be generated. However, the backend and frontend of fio can
+be run separately, i.e., the fio server can generate an I/O workload on the "Device
+Under Test" while being controlled by a client on another machine.
+.P
+Start the server on the machine which has access to the storage DUT:
+.RS
+.P
+$ fio \-\-server=args
+.RE
+.P
+where `args' defines what fio listens to. The arguments are of the form
+`type:hostname or IP,port'. `type' is either `ip' (or ip4) for TCP/IP
+v4, `ip6' for TCP/IP v6, or `sock' for a local unix domain socket.
+`hostname' is either a hostname or IP address, and `port' is the port to listen
+to (only valid for TCP/IP, not a local socket). Some examples:
+.RS
+.TP
 1) \fBfio \-\-server\fR
-
-   Start a fio server, listening on all interfaces on the default port (8765).
-
+Start a fio server, listening on all interfaces on the default port (8765).
+.TP
 2) \fBfio \-\-server=ip:hostname,4444\fR
-
-   Start a fio server, listening on IP belonging to hostname and on port 4444.
-
+Start a fio server, listening on IP belonging to hostname and on port 4444.
+.TP
 3) \fBfio \-\-server=ip6:::1,4444\fR
-
-   Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
-
+Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
+.TP
 4) \fBfio \-\-server=,4444\fR
-
-   Start a fio server, listening on all interfaces on port 4444.
-
+Start a fio server, listening on all interfaces on port 4444.
+.TP
 5) \fBfio \-\-server=1.2.3.4\fR
-
-   Start a fio server, listening on IP 1.2.3.4 on the default port.
-
+Start a fio server, listening on IP 1.2.3.4 on the default port.
+.TP
 6) \fBfio \-\-server=sock:/tmp/fio.sock\fR
-
-   Start a fio server, listening on the local socket /tmp/fio.sock.
-
-When a server is running, you can connect to it from a client. The client
-is run with:
-
-\fBfio \-\-local-args \-\-client=server \-\-remote-args <job file(s)>\fR
-
-where \-\-local-args are arguments that are local to the client where it is
-running, 'server' is the connect string, and \-\-remote-args and <job file(s)>
-are sent to the server. The 'server' string follows the same format as it
-does on the server side, to allow IP/hostname/socket and port strings.
-You can connect to multiple clients as well, to do that you could run:
-
-\fBfio \-\-client=server2 \-\-client=server2 <job file(s)>\fR
-
-If the job file is located on the fio server, then you can tell the server
-to load a local file as well. This is done by using \-\-remote-config:
-
-\fBfio \-\-client=server \-\-remote-config /path/to/file.fio\fR
-
-Then fio will open this local (to the server) job file instead
-of being passed one from the client.
-
+Start a fio server, listening on the local socket `/tmp/fio.sock'.
+.RE
+.P
+Once a server is running, a "client" can connect to the fio server with:
+.RS
+.P
+$ fio <local\-args> \-\-client=<server> <remote\-args> <job file(s)>
+.RE
+.P
+where `local\-args' are arguments for the client where it is running, `server'
+is the connect string, and `remote\-args' and `job file(s)' are sent to the
+server. The `server' string follows the same format as it does on the server
+side, to allow IP/hostname/socket and port strings.
+.P
+Fio can connect to multiple servers this way:
+.RS
+.P
+$ fio \-\-client=<server1> <job file(s)> \-\-client=<server2> <job file(s)>
+.RE
+.P
+If the job file is located on the fio server, then you can tell the server to
+load a local file as well. This is done by using \fB\-\-remote\-config\fR:
+.RS
+.P
+$ fio \-\-client=server \-\-remote\-config /path/to/file.fio
+.RE
+.P
+Then fio will open this local (to the server) job file instead of being passed
+one from the client.
+.P
 If you have many servers (example: 100 VMs/containers), you can input a pathname
-of a file containing host IPs/names as the parameter value for the \-\-client option.
-For example, here is an example "host.list" file containing 2 hostnames:
-
+of a file containing host IPs/names as the parameter value for the
+\fB\-\-client\fR option. For example, here is a `host.list'
+file containing 2 hostnames:
+.RS
+.P
+.PD 0
 host1.your.dns.domain
-.br
+.P
 host2.your.dns.domain
-
+.PD
+.RE
+.P
 The fio command would then be:
-
-\fBfio \-\-client=host.list <job file>\fR
-
-In this mode, you cannot input server-specific parameters or job files, and all
+.RS
+.P
+$ fio \-\-client=host.list <job file(s)>
+.RE
+.P
+In this mode, you cannot input server\-specific parameters or job files \-\- all
 servers receive the same job file.
-
-In order to enable fio \-\-client runs utilizing a shared filesystem from multiple hosts,
-fio \-\-client now prepends the IP address of the server to the filename. For example,
-if fio is using directory /mnt/nfs/fio and is writing filename fileio.tmp,
-with a \-\-client hostfile
-containing two hostnames h1 and h2 with IP addresses 192.168.10.120 and 192.168.10.121, then
-fio will create two files:
-
+.P
+In order to let `fio \-\-client' runs use a shared filesystem from multiple
+hosts, `fio \-\-client' now prepends the IP address of the server to the
+filename. For example, if fio is using the directory `/mnt/nfs/fio' and is
+writing filename `fileio.tmp', with a \fB\-\-client\fR `hostfile'
+containing two hostnames `h1' and `h2' with IP addresses 192.168.10.120 and
+192.168.10.121, then fio will create two files:
+.RS
+.P
+.PD 0
 /mnt/nfs/fio/192.168.10.120.fileio.tmp
-.br
+.P
 /mnt/nfs/fio/192.168.10.121.fileio.tmp
-
+.PD
+.RE
 .SH AUTHORS
-
 .B fio
 was written by Jens Axboe <jens.axboe@oracle.com>,
 now Jens Axboe <axboe@fb.com>.
 .br
 This man page was written by Aaron Carroll <aaronc@cse.unsw.edu.au> based
 on documentation by Jens Axboe.
+.br
+This man page was rewritten by Tomohiro Kusumi <tkusumi@tuxera.com> based
+on documentation by Jens Axboe.
 .SH "REPORTING BUGS"
 Report bugs to the \fBfio\fR mailing list <fio@vger.kernel.org>.
-See \fBREADME\fR.
+.br
+See \fBREPORTING\-BUGS\fR.
+.P
+\fBREPORTING\-BUGS\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/REPORTING\-BUGS\fR
 .SH "SEE ALSO"
 For further documentation see \fBHOWTO\fR and \fBREADME\fR.
 .br
-Sample jobfiles are available in the \fBexamples\fR directory.
+Sample jobfiles are available in the `examples/' directory.
+.br
+These are typically located under `/usr/share/doc/fio'.
+.P
+\fBHOWTO\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/HOWTO\fR
+.br
+\fBREADME\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/README\fR
diff -Nru fio-2.16/fio.h fio-3.1/fio.h
--- fio-2.16/fio.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/fio.h	2017-09-28 10:23:20.000000000 +0000
@@ -25,7 +25,7 @@
 #include "debug.h"
 #include "file.h"
 #include "io_ddir.h"
-#include "ioengine.h"
+#include "ioengines.h"
 #include "iolog.h"
 #include "helpers.h"
 #include "options.h"
@@ -35,10 +35,12 @@
 #include "oslib/getopt.h"
 #include "lib/rand.h"
 #include "lib/rbtree.h"
+#include "lib/num2str.h"
 #include "client.h"
 #include "server.h"
 #include "stat.h"
 #include "flow.h"
+#include "io_u.h"
 #include "io_u_queue.h"
 #include "workqueue.h"
 #include "steadystate.h"
@@ -57,6 +59,10 @@
 #define MPOL_LOCAL MPOL_MAX
 #endif
 
+#ifdef CONFIG_CUDA
+#include <cuda.h>
+#endif
+
 /*
  * offset generator types
  */
@@ -74,17 +80,20 @@
 	TD_F_VER_NONE		= 1U << 5,
 	TD_F_PROFILE_OPS	= 1U << 6,
 	TD_F_COMPRESS		= 1U << 7,
-	TD_F_NOIO		= 1U << 8,
+	TD_F_RESERVED		= 1U << 8, /* not used */
 	TD_F_COMPRESS_LOG	= 1U << 9,
 	TD_F_VSTATE_SAVED	= 1U << 10,
 	TD_F_NEED_LOCK		= 1U << 11,
 	TD_F_CHILD		= 1U << 12,
 	TD_F_NO_PROGRESS        = 1U << 13,
 	TD_F_REGROW_LOGS	= 1U << 14,
+	TD_F_MMAP_KEEP		= 1U << 15,
 };
 
 enum {
 	FIO_RAND_BS_OFF		= 0,
+	FIO_RAND_BS1_OFF,
+	FIO_RAND_BS2_OFF,
 	FIO_RAND_VER_OFF,
 	FIO_RAND_MIX_OFF,
 	FIO_RAND_FILE_OFF,
@@ -99,6 +108,8 @@
 	FIO_DEDUPE_OFF,
 	FIO_RAND_POISSON_OFF,
 	FIO_RAND_ZONE_OFF,
+	FIO_RAND_POISSON2_OFF,
+	FIO_RAND_POISSON3_OFF,
 	FIO_RAND_NR_OFFS,
 };
 
@@ -121,7 +132,6 @@
  * Per-thread/process specific data. Only used for the network client
  * for now.
  */
-struct sk_out;
 void sk_out_assign(struct sk_out *);
 void sk_out_drop(void);
 
@@ -142,7 +152,7 @@
 	unsigned int thread_number;
 	unsigned int subjob_number;
 	unsigned int groupid;
-	struct thread_stat ts;
+	struct thread_stat ts __attribute__ ((aligned(8)));
 
 	int client_type;
 
@@ -158,10 +168,10 @@
 	struct thread_data *parent;
 
 	uint64_t stat_io_bytes[DDIR_RWDIR_CNT];
-	struct timeval bw_sample_time;
+	struct timespec bw_sample_time;
 
 	uint64_t stat_io_blocks[DDIR_RWDIR_CNT];
-	struct timeval iops_sample_time;
+	struct timespec iops_sample_time;
 
 	volatile int update_rusage;
 	struct fio_mutex *rusage_sem;
@@ -205,11 +215,9 @@
 	void *iolog_buf;
 	FILE *iolog_f;
 
-	char *sysfs_root;
-
 	unsigned long rand_seeds[FIO_RAND_NR_OFFS];
 
-	struct frand_state bsrange_state;
+	struct frand_state bsrange_state[DDIR_RWDIR_CNT];
 	struct frand_state verify_state;
 	struct frand_state trim_state;
 	struct frand_state delay_state;
@@ -233,6 +241,7 @@
 	 * to any of the available IO engines.
 	 */
 	struct ioengine_ops *io_ops;
+	int io_ops_init;
 
 	/*
 	 * IO engine private data and dlhandle.
@@ -281,9 +290,9 @@
 	unsigned long rate_bytes[DDIR_RWDIR_CNT];
 	unsigned long rate_blocks[DDIR_RWDIR_CNT];
 	unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT];
-	struct timeval lastrate[DDIR_RWDIR_CNT];
-	int64_t last_usec;
-	struct frand_state poisson_state;
+	struct timespec lastrate[DDIR_RWDIR_CNT];
+	int64_t last_usec[DDIR_RWDIR_CNT];
+	struct frand_state poisson_state[DDIR_RWDIR_CNT];
 
 	/*
 	 * Enforced rate submission/completion workqueue
@@ -317,21 +326,21 @@
 	 */
 	struct frand_state random_state;
 
-	struct timeval start;	/* start of this loop */
-	struct timeval epoch;	/* time job was started */
+	struct timespec start;	/* start of this loop */
+	struct timespec epoch;	/* time job was started */
 	unsigned long long unix_epoch; /* Time job was started, unix epoch based. */
-	struct timeval last_issue;
+	struct timespec last_issue;
 	long time_offset;
-	struct timeval tv_cache;
-	struct timeval terminate_time;
-	unsigned int tv_cache_nr;
-	unsigned int tv_cache_mask;
-	unsigned int ramp_time_over;
+	struct timespec ts_cache;
+	struct timespec terminate_time;
+	unsigned int ts_cache_nr;
+	unsigned int ts_cache_mask;
+	bool ramp_time_over;
 
 	/*
 	 * Time since last latency_window was started
 	 */
-	struct timeval latency_ts;
+	struct timespec latency_ts;
 	unsigned int latency_qd;
 	unsigned int latency_qd_high;
 	unsigned int latency_qd_low;
@@ -406,6 +415,18 @@
 	struct steadystate_data ss;
 
 	char verror[FIO_VERROR_SIZE];
+
+#ifdef CONFIG_CUDA
+	/*
+	 * for GPU memory management
+	 */
+	int gpu_dev_cnt;
+	int gpu_dev_id;
+	CUdevice  cu_dev;
+	CUcontext cu_ctx;
+	CUdeviceptr dev_mem_ptr;
+#endif	
+
 };
 
 /*
@@ -492,7 +513,7 @@
 {
 	if (td->last_was_sync)
 		return 0;
-	if (td_write(td) || td_rw(td) || td->o.override_sync)
+	if (td_write(td) || td->o.override_sync)
 		return 1;
 
 	return 0;
@@ -518,11 +539,9 @@
 extern void fio_options_set_ioengine_opts(struct option *long_options, struct thread_data *td);
 extern void fio_options_dup_and_init(struct option *);
 extern void fio_options_mem_dupe(struct thread_data *);
-extern void options_mem_dupe(void *data, struct fio_option *options);
 extern void td_fill_rand_seeds(struct thread_data *);
 extern void td_fill_verify_state_seed(struct thread_data *);
 extern void add_job_opts(const char **, int);
-extern char *num2str(uint64_t, int, int, int, int);
 extern int ioengine_load(struct thread_data *);
 extern bool parse_dryrun(void);
 extern int fio_running_or_pending_io_threads(void);
@@ -580,7 +599,8 @@
 
 static inline void td_set_ioengine_flags(struct thread_data *td)
 {
-	td->flags |= (td->io_ops->flags << TD_ENG_FLAG_SHIFT);
+	td->flags = (~(TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT) & td->flags) |
+		    (td->io_ops->flags << TD_ENG_FLAG_SHIFT);
 }
 
 static inline bool td_ioengine_flagged(struct thread_data *td,
@@ -613,22 +633,19 @@
 extern void free_io_mem(struct thread_data *);
 extern void free_threads_shm(void);
 
+#ifdef FIO_INTERNAL
+#define PTR_ALIGN(ptr, mask)	\
+	(char *) (((uintptr_t) (ptr) + (mask)) & ~(mask))
+#endif
+
 /*
  * Reset stats after ramp time completes
  */
 extern void reset_all_stats(struct thread_data *);
 
-/*
- * blktrace support
- */
-#ifdef FIO_HAVE_BLKTRACE
-extern int is_blktrace(const char *, int *);
-extern int load_blktrace(struct thread_data *, const char *, int);
-#endif
-
 extern int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret,
 		   enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify,
-		   struct timeval *comp_time);
+		   struct timespec *comp_time);
 
 /*
  * Latency target helpers
@@ -637,6 +654,9 @@
 extern void lat_target_init(struct thread_data *);
 extern void lat_target_reset(struct thread_data *);
 
+/*
+ * Iterates all threads/processes within all the defined jobs
+ */
 #define for_each_td(td, i)	\
 	for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++)
 #define for_each_file(td, f, i)	\
diff -Nru fio-2.16/fio_time.h fio-3.1/fio_time.h
--- fio-2.16/fio_time.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/fio_time.h	2017-09-28 10:23:20.000000000 +0000
@@ -4,22 +4,24 @@
 #include "lib/types.h"
 
 struct thread_data;
-extern uint64_t utime_since(const struct timeval *,const  struct timeval *);
-extern uint64_t utime_since_now(const struct timeval *);
-extern uint64_t mtime_since(const struct timeval *, const struct timeval *);
-extern uint64_t mtime_since_now(const struct timeval *);
-extern uint64_t time_since_now(const struct timeval *);
+extern uint64_t ntime_since(const struct timespec *, const struct timespec *);
+extern uint64_t utime_since(const struct timespec *, const struct timespec *);
+extern uint64_t utime_since_now(const struct timespec *);
+extern uint64_t mtime_since(const struct timespec *, const struct timespec *);
+extern uint64_t mtime_since_now(const struct timespec *);
+extern uint64_t mtime_since_tv(const struct timeval *, const struct timeval *);
+extern uint64_t time_since_now(const struct timespec *);
 extern uint64_t time_since_genesis(void);
 extern uint64_t mtime_since_genesis(void);
 extern uint64_t utime_since_genesis(void);
 extern uint64_t usec_spin(unsigned int);
 extern uint64_t usec_sleep(struct thread_data *, unsigned long);
-extern void fill_start_time(struct timeval *);
+extern void fill_start_time(struct timespec *);
 extern void set_genesis_time(void);
 extern bool ramp_time_over(struct thread_data *);
 extern bool in_ramp_time(struct thread_data *);
 extern void fio_time_init(void);
-extern void timeval_add_msec(struct timeval *, unsigned int);
+extern void timespec_add_msec(struct timespec *, unsigned int);
 extern void set_epoch_time(struct thread_data *, int);
 
 #endif
diff -Nru fio-2.16/FIO-VERSION-GEN fio-3.1/FIO-VERSION-GEN
--- fio-2.16/FIO-VERSION-GEN	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/FIO-VERSION-GEN	2017-09-28 10:23:20.000000000 +0000
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-2.16
+DEF_VER=fio-3.1
 
 LF='
 '
diff -Nru fio-2.16/flist.h fio-3.1/flist.h
--- fio-2.16/flist.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/flist.h	2017-09-28 10:23:20.000000000 +0000
@@ -2,13 +2,7 @@
 #define _LINUX_FLIST_H
 
 #include <stdlib.h>
-
-#undef offsetof
-#ifdef __compiler_offsetof
-#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
-#else
-#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
-#endif
+#include <stddef.h>
 
 #define container_of(ptr, type, member) ({			\
 	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
diff -Nru fio-2.16/gclient.c fio-3.1/gclient.c
--- fio-2.16/gclient.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/gclient.c	2017-09-28 10:23:20.000000000 +0000
@@ -48,7 +48,7 @@
 	{ "PrintFile", GTK_STOCK_PRINT, "Print", "<Control>P", NULL, G_CALLBACK(results_print) },
 	{ "CloseFile", GTK_STOCK_CLOSE, "Close", "<Control>W", NULL, G_CALLBACK(results_close) },
 };
-static gint results_nmenu_items = sizeof(results_menu_items) / sizeof(results_menu_items[0]);
+static gint results_nmenu_items = ARRAY_SIZE(results_menu_items);
 
 static const gchar *results_ui_string = " \
 	<ui> \
@@ -364,29 +364,11 @@
 	sprintf(tmp, "%u", je->files_open);
 	gtk_entry_set_text(GTK_ENTRY(ge->eta.files), tmp);
 
-#if 0
-	if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) {
-	if (je->m_rate || je->t_rate) {
-		char *tr, *mr;
-
-		mr = num2str(je->m_rate, 4, 0, i2p);
-		tr = num2str(je->t_rate, 4, 0, i2p);
-		gtk_entry_set_text(GTK_ENTRY(ge->eta);
-		p += sprintf(p, ", CR=%s/%s KB/s", tr, mr);
-		free(tr);
-		free(mr);
-	} else if (je->m_iops || je->t_iops)
-		p += sprintf(p, ", CR=%d/%d IOPS", je->t_iops, je->m_iops);
-
-	gtk_entry_set_text(GTK_ENTRY(ge->eta.cr_bw), "---");
-	gtk_entry_set_text(GTK_ENTRY(ge->eta.cr_iops), "---");
-	gtk_entry_set_text(GTK_ENTRY(ge->eta.cw_bw), "---");
-	gtk_entry_set_text(GTK_ENTRY(ge->eta.cw_iops), "---");
-#endif
-
 	if (je->eta_sec != INT_MAX && je->nr_running) {
 		char *iops_str[DDIR_RWDIR_CNT];
 		char *rate_str[DDIR_RWDIR_CNT];
+		char *rate_alt[DDIR_RWDIR_CNT];
+		char tmp[128];
 		int i;
 
 		if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running)
@@ -397,19 +379,26 @@
 			sprintf(output, "%3.1f%% done", perc);
 		}
 
-		rate_str[0] = num2str(je->rate[0], 5, 10, i2p, 0);
-		rate_str[1] = num2str(je->rate[1], 5, 10, i2p, 0);
-		rate_str[2] = num2str(je->rate[2], 5, 10, i2p, 0);
-
-		iops_str[0] = num2str(je->iops[0], 4, 1, 0, 0);
-		iops_str[1] = num2str(je->iops[1], 4, 1, 0, 0);
-		iops_str[2] = num2str(je->iops[2], 4, 1, 0, 0);
-
-		gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), rate_str[0]);
+		iops_str[0] = num2str(je->iops[0], 4, 1, 0, N2S_PERSEC);
+		iops_str[1] = num2str(je->iops[1], 4, 1, 0, N2S_PERSEC);
+		iops_str[2] = num2str(je->iops[2], 4, 1, 0, N2S_PERSEC);
+
+		rate_str[0] = num2str(je->rate[0], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[0] = num2str(je->rate[0], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]);
+		gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ge->eta.read_iops), iops_str[0]);
-		gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), rate_str[1]);
+
+		rate_str[1] = num2str(je->rate[1], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[1] = num2str(je->rate[1], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]);
+		gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ge->eta.write_iops), iops_str[1]);
-		gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), rate_str[2]);
+
+		rate_str[2] = num2str(je->rate[2], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[2] = num2str(je->rate[2], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]);
+		gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_iops), iops_str[2]);
 
 		graph_add_xy_data(ge->graphs.iops_graph, ge->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]);
@@ -421,6 +410,7 @@
 
 		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 			free(rate_str[i]);
+			free(rate_alt[i]);
 			free(iops_str[i]);
 		}
 	}
@@ -457,31 +447,13 @@
 		eta_to_str(eta_str, je->eta_sec);
 	}
 
-#if 0
-	if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) {
-	if (je->m_rate || je->t_rate) {
-		char *tr, *mr;
-
-		mr = num2str(je->m_rate, 4, 0, i2p);
-		tr = num2str(je->t_rate, 4, 0, i2p);
-		gtk_entry_set_text(GTK_ENTRY(ui->eta);
-		p += sprintf(p, ", CR=%s/%s KB/s", tr, mr);
-		free(tr);
-		free(mr);
-	} else if (je->m_iops || je->t_iops)
-		p += sprintf(p, ", CR=%d/%d IOPS", je->t_iops, je->m_iops);
-
-	gtk_entry_set_text(GTK_ENTRY(ui->eta.cr_bw), "---");
-	gtk_entry_set_text(GTK_ENTRY(ui->eta.cr_iops), "---");
-	gtk_entry_set_text(GTK_ENTRY(ui->eta.cw_bw), "---");
-	gtk_entry_set_text(GTK_ENTRY(ui->eta.cw_iops), "---");
-#endif
-
 	entry_set_int_value(ui->eta.jobs, je->nr_running);
 
 	if (je->eta_sec != INT_MAX && je->nr_running) {
-		char *iops_str[3];
-		char *rate_str[3];
+		char *iops_str[DDIR_RWDIR_CNT];
+		char *rate_str[DDIR_RWDIR_CNT];
+		char *rate_alt[DDIR_RWDIR_CNT];
+		char tmp[128];
 
 		if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running)
 			strcpy(output, "-.-% done");
@@ -491,19 +463,26 @@
 			sprintf(output, "%3.1f%% done", perc);
 		}
 
-		rate_str[0] = num2str(je->rate[0], 5, 10, i2p, 0);
-		rate_str[1] = num2str(je->rate[1], 5, 10, i2p, 0);
-		rate_str[2] = num2str(je->rate[2], 5, 10, i2p, 0);
-
-		iops_str[0] = num2str(je->iops[0], 4, 1, 0, 0);
-		iops_str[1] = num2str(je->iops[1], 4, 1, 0, 0);
-		iops_str[2] = num2str(je->iops[2], 4, 1, 0, 0);
-
-		gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), rate_str[0]);
+		iops_str[0] = num2str(je->iops[0], 4, 1, 0, N2S_PERSEC);
+		iops_str[1] = num2str(je->iops[1], 4, 1, 0, N2S_PERSEC);
+		iops_str[2] = num2str(je->iops[2], 4, 1, 0, N2S_PERSEC);
+
+		rate_str[0] = num2str(je->rate[0], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[0] = num2str(je->rate[0], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]);
+		gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ui->eta.read_iops), iops_str[0]);
-		gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), rate_str[1]);
+
+		rate_str[1] = num2str(je->rate[1], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[1] = num2str(je->rate[1], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]);
+		gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ui->eta.write_iops), iops_str[1]);
-		gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), rate_str[2]);
+
+		rate_str[2] = num2str(je->rate[2], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[2] = num2str(je->rate[2], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]);
+		gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_iops), iops_str[2]);
 
 		graph_add_xy_data(ui->graphs.iops_graph, ui->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]);
@@ -515,6 +494,7 @@
 
 		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 			free(rate_str[i]);
+			free(rate_alt[i]);
 			free(iops_str[i]);
 		}
 	}
@@ -592,6 +572,7 @@
 	struct thread_options *o;
 	char *c1, *c2, *c3, *c4;
 	char tmp[80];
+	int i2p;
 
 	p->thread_number = le32_to_cpu(p->thread_number);
 	p->groupid = le32_to_cpu(p->groupid);
@@ -605,11 +586,13 @@
 	sprintf(tmp, "%s %s", o->odirect ? "direct" : "buffered", ddir_str(o->td_ddir));
 	multitext_add_entry(&ge->eta.iotype, tmp);
 
-	c1 = fio_uint_to_kmg(o->min_bs[DDIR_READ]);
-	c2 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]);
-	c3 = fio_uint_to_kmg(o->min_bs[DDIR_READ]);
-	c4 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]);
-	sprintf(tmp, "%s-%s/%s-%s", c1, c2, c3, c4);
+	i2p = is_power_of_2(o->kb_base);
+	c1 = num2str(o->min_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE);
+	c2 = num2str(o->max_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE);
+	c3 = num2str(o->min_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE);
+	c4 = num2str(o->max_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE);
+
+	sprintf(tmp, "%s-%s,%s-%s", c1, c2, c3, c4);
 	free(c1);
 	free(c2);
 	free(c3);
@@ -947,18 +930,21 @@
 static void gfio_show_latency_buckets(struct gfio_client *gc, GtkWidget *vbox,
 				      struct thread_stat *ts)
 {
-	double io_u_lat[FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR];
-	const char *ranges[] = { "2u", "4u", "10u", "20u", "50u", "100u",
-				 "250u", "500u", "750u", "1m", "2m",
-				 "4m", "10m", "20m", "50m", "100m",
-				 "250m", "500m", "750m", "1s", "2s", ">= 2s" };
+	double io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR];
+	const char *ranges[] = { "2ns", "4ns", "10ns", "20ns", "50ns", "100ns",
+				 "250ns", "500ns", "750ns", "1000ns", "2us",
+				 "4us", "10us", "20us", "50us", "100us",
+				 "250us", "500us", "750us", "1ms", "2ms",
+				 "4ms", "10ms", "20ms", "50ms", "100ms",
+				 "250ms", "500ms", "750ms", "1s", "2s", ">= 2s" };
 	int start, end, i;
 	const int total = FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR;
 	GtkWidget *frame, *tree_view, *hbox, *completion_vbox, *drawing_area;
 	struct gui_entry *ge = gc->ge;
 
-	stat_calc_lat_u(ts, io_u_lat);
-	stat_calc_lat_m(ts, &io_u_lat[FIO_IO_U_LAT_U_NR]);
+	stat_calc_lat_n(ts, io_u_lat);
+	stat_calc_lat_u(ts, &io_u_lat[FIO_IO_U_LAT_N_NR]);
+	stat_calc_lat_m(ts, &io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR]);
 
 	/*
 	 * Found out which first bucket has entries, and which last bucket
@@ -980,7 +966,7 @@
 		return;
 
 	tree_view = gfio_output_lat_buckets(&io_u_lat[start], &ranges[start], end - start + 1);
-	ge->lat_bucket_graph = setup_lat_bucket_graph("Latency Buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0);
+	ge->lat_bucket_graph = setup_lat_bucket_graph("Latency buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0);
 
 	frame = gtk_frame_new("Latency buckets");
 	gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5);
@@ -1000,19 +986,21 @@
 	gtk_box_pack_start(GTK_BOX(hbox), tree_view, TRUE, TRUE, 3);
 }
 
-static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long min,
-			  unsigned long max, double mean, double dev)
+static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long long min,
+			  unsigned long long max, double mean, double dev)
 {
-	const char *base = "(usec)";
+	const char *base = "(nsec)";
 	GtkWidget *hbox, *label, *frame;
 	char *minp, *maxp;
 	char tmp[64];
 
-	if (usec_to_msec(&min, &max, &mean, &dev))
+	if (nsec_to_msec(&min, &max, &mean, &dev))
 		base = "(msec)";
+	else if (nsec_to_usec(&min, &max, &mean, &dev))
+		base = "(usec)";
 
-	minp = num2str(min, 6, 1, 0, 0);
-	maxp = num2str(max, 6, 1, 0, 0);
+	minp = num2str(min, 6, 1, 0, N2S_NONE);
+	maxp = num2str(max, 6, 1, 0, N2S_NONE);
 
 	sprintf(tmp, "%s %s", name, base);
 	frame = gtk_frame_new(tmp);
@@ -1036,7 +1024,7 @@
 	free(maxp);
 }
 
-static GtkWidget *gfio_output_clat_percentiles(unsigned int *ovals,
+static GtkWidget *gfio_output_clat_percentiles(unsigned long long *ovals,
 					       fio_fp64_t *plist,
 					       unsigned int len,
 					       const char *base,
@@ -1047,10 +1035,10 @@
 	GtkTreeSelection *selection;
 	GtkListStore *model;
 	GtkTreeIter iter;
-	int i;
+	int i, j;
 
 	for (i = 0; i < len; i++)
-		types[i] = G_TYPE_INT;
+		types[i] = G_TYPE_ULONG;
 
 	model = gtk_list_store_newv(len, types);
 
@@ -1073,15 +1061,15 @@
 	gtk_list_store_append(model, &iter);
 
 	for (i = 0; i < len; i++) {
-		if (scale)
+		for (j = 0; j < scale; j++)
 			ovals[i] = (ovals[i] + 999) / 1000;
-		gtk_list_store_set(model, &iter, i, ovals[i], -1);
+		gtk_list_store_set(model, &iter, i, (unsigned long) ovals[i], -1);
 	}
 
 	return tree_view;
 }
 
-static struct graph *setup_clat_graph(char *title, unsigned int *ovals,
+static struct graph *setup_clat_graph(char *title, unsigned long long *ovals,
 				      fio_fp64_t *plist,
 				      unsigned int len,
 				      double xdim, double ydim)
@@ -1113,7 +1101,8 @@
 	unsigned int *io_u_plat = ts->io_u_plat[ddir];
 	unsigned long nr = ts->clat_stat[ddir].samples;
 	fio_fp64_t *plist = ts->percentile_list;
-	unsigned int *ovals, len, minv, maxv, scale_down;
+	unsigned int len, scale_down;
+	unsigned long long *ovals, minv, maxv;
 	const char *base;
 	GtkWidget *tree_view, *frame, *hbox, *drawing_area, *completion_vbox;
 	struct gui_entry *ge = gc->ge;
@@ -1124,18 +1113,25 @@
 		goto out;
 
 	/*
-	 * We default to usecs, but if the value range is such that we
-	 * should scale down to msecs, do that.
+	 * We default to nsecs, but if the value range is such that we
+	 * should scale down to usecs or msecs, do that.
 	 */
-	if (minv > 2000 && maxv > 99999) {
-		scale_down = 1;
+        if (minv > 2000000 && maxv > 99999999ULL) {
+                scale_down = 2;
 		base = "msec";
-	} else {
-		scale_down = 0;
+        } else if (minv > 2000 && maxv > 99999) {
+                scale_down = 1;
 		base = "usec";
-	}
+        } else {
+                scale_down = 0;
+		base = "nsec";
+        }
+
+	if (ts->clat_percentiles)
+		sprintf(tmp, "Completion percentiles (%s)", base);
+	else
+		sprintf(tmp, "Latency percentiles (%s)", base);
 
-	sprintf(tmp, "Completion percentiles (%s)", base);
 	tree_view = gfio_output_clat_percentiles(ovals, plist, len, base, scale_down);
 	ge->clat_graph = setup_clat_graph(tmp, ovals, plist, len, 700.0, 300.0);
 
@@ -1169,11 +1165,13 @@
 {
 	const char *ddir_label[3] = { "Read", "Write", "Trim" };
 	GtkWidget *frame, *label, *box, *vbox, *main_vbox;
-	unsigned long min[3], max[3], runt;
+	unsigned long long min[3], max[3];
+	unsigned long runt;
 	unsigned long long bw, iops;
 	unsigned int flags = 0;
 	double mean[3], dev[3];
-	char *io_p, *bw_p, *iops_p;
+	char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p;
+	char tmp[128];
 	int i2p;
 
 	if (!ts->runtime[ddir])
@@ -1183,11 +1181,9 @@
 	runt = ts->runtime[ddir];
 
 	bw = (1000 * ts->io_bytes[ddir]) / runt;
-	io_p = num2str(ts->io_bytes[ddir], 6, 1, i2p, 8);
-	bw_p = num2str(bw, 6, 1, i2p, ts->unit_base);
 
 	iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt;
-	iops_p = num2str(iops, 6, 1, 0, 0);
+	iops_p = num2str(iops, 4, 1, 0, N2S_PERSEC);
 
 	box = gtk_hbox_new(FALSE, 3);
 	gtk_box_pack_start(GTK_BOX(mbox), box, TRUE, FALSE, 3);
@@ -1202,9 +1198,17 @@
 	gtk_box_pack_start(GTK_BOX(main_vbox), box, TRUE, FALSE, 3);
 
 	label = new_info_label_in_frame(box, "IO");
-	gtk_label_set_text(GTK_LABEL(label), io_p);
+	io_p = num2str(ts->io_bytes[ddir], 4, 1, i2p, N2S_BYTE);
+	io_palt = num2str(ts->io_bytes[ddir], 4, 1, !i2p, N2S_BYTE);
+	snprintf(tmp, sizeof(tmp), "%s (%s)", io_p, io_palt);
+	gtk_label_set_text(GTK_LABEL(label), tmp);
+
 	label = new_info_label_in_frame(box, "Bandwidth");
-	gtk_label_set_text(GTK_LABEL(label), bw_p);
+	bw_p = num2str(bw, 4, 1, i2p, ts->unit_base);
+	bw_palt = num2str(bw, 4, 1, !i2p, ts->unit_base);
+	snprintf(tmp, sizeof(tmp), "%s (%s)", bw_p, bw_palt);
+	gtk_label_set_text(GTK_LABEL(label), tmp);
+
 	label = new_info_label_in_frame(box, "IOPS");
 	gtk_label_set_text(GTK_LABEL(label), iops_p);
 	label = new_info_label_in_frame(box, "Runtime (msec)");
@@ -1212,7 +1216,7 @@
 
 	if (calc_lat(&ts->bw_stat[ddir], &min[0], &max[0], &mean[0], &dev[0])) {
 		double p_of_agg = 100.0;
-		const char *bw_str = "KB";
+		const char *bw_str = "KiB/s";
 		char tmp[32];
 
 		if (rs->agg[ddir]) {
@@ -1221,14 +1225,21 @@
 				p_of_agg = 100.0;
 		}
 
-		if (mean[0] > 999999.9) {
-			min[0] /= 1000.0;
-			max[0] /= 1000.0;
-			mean[0] /= 1000.0;
-			dev[0] /= 1000.0;
-			bw_str = "MB";
+		if (mean[0] > 1073741824.9) {
+			min[0] /= 1048576.0;
+			max[0] /= 1048576.0;
+			mean[0] /= 1048576.0;
+			dev[0] /= 1048576.0;
+			bw_str = "GiB/s";
 		}
 
+		if (mean[0] > 1047575.9) {
+			min[0] /= 1024.0;
+			max[0] /= 1024.0;
+			mean[0] /= 1024.0;
+			dev[0] /= 1024.0;
+			bw_str = "MiB/s";
+		}
 		sprintf(tmp, "Bandwidth (%s)", bw_str);
 		frame = gtk_frame_new(tmp);
 		gtk_box_pack_start(GTK_BOX(main_vbox), frame, FALSE, FALSE, 5);
@@ -1278,6 +1289,8 @@
 
 	free(io_p);
 	free(bw_p);
+	free(io_palt);
+	free(bw_palt);
 	free(iops_p);
 }
 
diff -Nru fio-2.16/gettime.c fio-3.1/gettime.c
--- fio-2.16/gettime.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/gettime.c	2017-09-28 10:23:20.000000000 +0000
@@ -15,19 +15,22 @@
 
 #if defined(ARCH_HAVE_CPU_CLOCK)
 #ifndef ARCH_CPU_CLOCK_CYCLES_PER_USEC
-static unsigned long cycles_per_usec;
-static unsigned long inv_cycles_per_usec;
-static uint64_t max_cycles_for_mult;
+static unsigned long cycles_per_msec;
+static unsigned long long cycles_start;
+static unsigned long long clock_mult;
+static unsigned long long max_cycles_mask;
+static unsigned long long nsecs_for_max_cycles;
+static unsigned int clock_shift;
+static unsigned int max_cycles_shift;
+#define MAX_CLOCK_SEC 60*60
 #endif
 #ifdef ARCH_CPU_CLOCK_WRAPS
-static unsigned long long cycles_start, cycles_wrap;
+static unsigned int cycles_wrap;
 #endif
 #endif
-int tsc_reliable = 0;
+bool tsc_reliable = false;
 
 struct tv_valid {
-	uint64_t last_cycles;
-	int last_tv_valid;
 	int warned;
 };
 #ifdef ARCH_HAVE_CPU_CLOCK
@@ -143,31 +146,31 @@
 }
 #endif
 
-static void __fio_gettime(struct timeval *tp)
+static void __fio_gettime(struct timespec *tp)
 {
 	switch (fio_clock_source) {
 #ifdef CONFIG_GETTIMEOFDAY
-	case CS_GTOD:
-		gettimeofday(tp, NULL);
+	case CS_GTOD: {
+		struct timeval tv;
+		gettimeofday(&tv, NULL);
+
+		tp->tv_sec = tv.tv_sec;
+		tp->tv_nsec = tv.tv_usec * 1000;
 		break;
+		}
 #endif
 #ifdef CONFIG_CLOCK_GETTIME
 	case CS_CGETTIME: {
-		struct timespec ts;
-
-		if (fill_clock_gettime(&ts) < 0) {
+		if (fill_clock_gettime(tp) < 0) {
 			log_err("fio: clock_gettime fails\n");
 			assert(0);
 		}
-
-		tp->tv_sec = ts.tv_sec;
-		tp->tv_usec = ts.tv_nsec / 1000;
 		break;
 		}
 #endif
 #ifdef ARCH_HAVE_CPU_CLOCK
 	case CS_CPUCLOCK: {
-		uint64_t usecs, t;
+		uint64_t nsecs, t, multiples;
 		struct tv_valid *tv;
 
 #ifdef CONFIG_TLS_THREAD
@@ -184,21 +187,17 @@
 			log_err("fio: double CPU clock wrap\n");
 			tv->warned = 1;
 		}
-
-		t -= cycles_start;
 #endif
-		tv->last_cycles = t;
-		tv->last_tv_valid = 1;
 #ifdef ARCH_CPU_CLOCK_CYCLES_PER_USEC
-		usecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC;
+		nsecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC * 1000;
 #else
-		if (t < max_cycles_for_mult)
-			usecs = (t * inv_cycles_per_usec) / 16777216UL;
-		else
-			usecs = t / cycles_per_usec;
+		t -= cycles_start;
+		multiples = t >> max_cycles_shift;
+		nsecs = multiples * nsecs_for_max_cycles;
+		nsecs += ((t & max_cycles_mask) * clock_mult) >> clock_shift;
 #endif
-		tp->tv_sec = usecs / 1000000;
-		tp->tv_usec = usecs % 1000000;
+		tp->tv_sec = nsecs / 1000000000ULL;
+		tp->tv_nsec = nsecs % 1000000000ULL;
 		break;
 		}
 #endif
@@ -209,9 +208,9 @@
 }
 
 #ifdef FIO_DEBUG_TIME
-void fio_gettime(struct timeval *tp, void *caller)
+void fio_gettime(struct timespec *tp, void *caller)
 #else
-void fio_gettime(struct timeval *tp, void fio_unused *caller)
+void fio_gettime(struct timespec *tp, void fio_unused *caller)
 #endif
 {
 #ifdef FIO_DEBUG_TIME
@@ -227,9 +226,9 @@
 }
 
 #if defined(ARCH_HAVE_CPU_CLOCK) && !defined(ARCH_CPU_CLOCK_CYCLES_PER_USEC)
-static unsigned long get_cycles_per_usec(void)
+static unsigned long get_cycles_per_msec(void)
 {
-	struct timeval s, e;
+	struct timespec s, e;
 	uint64_t c_s, c_e;
 	enum fio_cs old_cs = fio_clock_source;
 	uint64_t elapsed;
@@ -253,7 +252,7 @@
 	} while (1);
 
 	fio_clock_source = old_cs;
-	return (c_e - c_s) / elapsed;
+	return (c_e - c_s) * 1000 / elapsed;
 }
 
 #define NR_TIME_ITERS	50
@@ -262,12 +261,13 @@
 {
 	double delta, mean, S;
 	uint64_t minc, maxc, avg, cycles[NR_TIME_ITERS];
-	int i, samples;
+	int i, samples, sft = 0;
+	unsigned long long tmp, max_ticks, max_mult;
 
-	cycles[0] = get_cycles_per_usec();
+	cycles[0] = get_cycles_per_msec();
 	S = delta = mean = 0.0;
 	for (i = 0; i < NR_TIME_ITERS; i++) {
-		cycles[i] = get_cycles_per_usec();
+		cycles[i] = get_cycles_per_msec();
 		delta = cycles[i] - mean;
 		if (delta) {
 			mean += delta / (i + 1.0);
@@ -304,19 +304,67 @@
 		dprint(FD_TIME, "cycles[%d]=%llu\n", i, (unsigned long long) cycles[i]);
 
 	avg /= samples;
+	cycles_per_msec = avg;
 	dprint(FD_TIME, "avg: %llu\n", (unsigned long long) avg);
 	dprint(FD_TIME, "min=%llu, max=%llu, mean=%f, S=%f\n",
 			(unsigned long long) minc,
 			(unsigned long long) maxc, mean, S);
 
-	cycles_per_usec = avg;
-	inv_cycles_per_usec = 16777216UL / cycles_per_usec;
-	max_cycles_for_mult = ~0ULL / inv_cycles_per_usec;
-	dprint(FD_TIME, "inv_cycles_per_usec=%lu\n", inv_cycles_per_usec);
-#ifdef ARCH_CPU_CLOCK_WRAPS
+	max_ticks = MAX_CLOCK_SEC * cycles_per_msec * 1000ULL;
+	max_mult = ULLONG_MAX / max_ticks;
+	dprint(FD_TIME, "\n\nmax_ticks=%llu, __builtin_clzll=%d, "
+			"max_mult=%llu\n", max_ticks,
+			__builtin_clzll(max_ticks), max_mult);
+
+        /*
+         * Find the largest shift count that will produce
+         * a multiplier that does not exceed max_mult
+         */
+        tmp = max_mult * cycles_per_msec / 1000000;
+        while (tmp > 1) {
+                tmp >>= 1;
+                sft++;
+                dprint(FD_TIME, "tmp=%llu, sft=%u\n", tmp, sft);
+        }
+
+	clock_shift = sft;
+	clock_mult = (1ULL << sft) * 1000000 / cycles_per_msec;
+	dprint(FD_TIME, "clock_shift=%u, clock_mult=%llu\n", clock_shift,
+							clock_mult);
+
+	/*
+	 * Find the greatest power of 2 clock ticks that is less than the
+	 * ticks in MAX_CLOCK_SEC seconds
+	 */
+	max_cycles_shift = max_cycles_mask = 0;
+	tmp = MAX_CLOCK_SEC * 1000ULL * cycles_per_msec;
+	dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp,
+							max_cycles_shift);
+	while (tmp > 1) {
+		tmp >>= 1;
+		max_cycles_shift++;
+		dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
+	}
+	/*
+	 * if we use (1ULL << max_cycles_shift) * 1000 / cycles_per_msec
+	 * here we will have a discontinuity every
+	 * (1ULL << max_cycles_shift) cycles
+	 */
+	nsecs_for_max_cycles = ((1ULL << max_cycles_shift) * clock_mult)
+					>> clock_shift;
+
+	/* Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) */
+	for (tmp = 0; tmp < max_cycles_shift; tmp++)
+		max_cycles_mask |= 1ULL << tmp;
+
+	dprint(FD_TIME, "max_cycles_shift=%u, 2^max_cycles_shift=%llu, "
+			"nsecs_for_max_cycles=%llu, "
+			"max_cycles_mask=%016llx\n",
+			max_cycles_shift, (1ULL << max_cycles_shift),
+			nsecs_for_max_cycles, max_cycles_mask);
+
 	cycles_start = get_cpu_clock();
 	dprint(FD_TIME, "cycles_start=%llu\n", cycles_start);
-#endif
 	return 0;
 }
 #else
@@ -365,7 +413,7 @@
 	fio_clock_source_inited = fio_clock_source;
 
 	if (calibrate_cpu_clock())
-		tsc_reliable = 0;
+		tsc_reliable = false;
 
 	/*
 	 * If the arch sets tsc_reliable != 0, then it must be good enough
@@ -377,14 +425,35 @@
 			fio_clock_source = CS_CPUCLOCK;
 	} else if (fio_clock_source == CS_CPUCLOCK)
 		log_info("fio: clocksource=cpu may not be reliable\n");
+	dprint(FD_TIME, "gettime: clocksource=%d\n", (int) fio_clock_source);
 }
 
-uint64_t utime_since(const struct timeval *s, const struct timeval *e)
+uint64_t ntime_since(const struct timespec *s, const struct timespec *e)
+{
+       int64_t sec, nsec;
+
+       sec = e->tv_sec - s->tv_sec;
+       nsec = e->tv_nsec - s->tv_nsec;
+       if (sec > 0 && nsec < 0) {
+	       sec--;
+	       nsec += 1000000000LL;
+       }
+
+       /*
+	* time warp bug on some kernels?
+	*/
+       if (sec < 0 || (sec == 0 && nsec < 0))
+	       return 0;
+
+       return nsec + (sec * 1000000000LL);
+}
+
+uint64_t utime_since(const struct timespec *s, const struct timespec *e)
 {
 	int64_t sec, usec;
 
 	sec = e->tv_sec - s->tv_sec;
-	usec = e->tv_usec - s->tv_usec;
+	usec = (e->tv_nsec - s->tv_nsec) / 1000;
 	if (sec > 0 && usec < 0) {
 		sec--;
 		usec += 1000000;
@@ -399,20 +468,26 @@
 	return usec + (sec * 1000000);
 }
 
-uint64_t utime_since_now(const struct timeval *s)
+uint64_t utime_since_now(const struct timespec *s)
 {
-	struct timeval t;
+	struct timespec t;
+#ifdef FIO_DEBUG_TIME
+	void *p = __builtin_return_address(0);
 
+	fio_gettime(&t, p);
+#else
 	fio_gettime(&t, NULL);
+#endif
+
 	return utime_since(s, &t);
 }
 
-uint64_t mtime_since(const struct timeval *s, const struct timeval *e)
+uint64_t mtime_since_tv(const struct timeval *s, const struct timeval *e)
 {
-	long sec, usec;
+	int64_t sec, usec;
 
 	sec = e->tv_sec - s->tv_sec;
-	usec = e->tv_usec - s->tv_usec;
+	usec = (e->tv_usec - s->tv_usec);
 	if (sec > 0 && usec < 0) {
 		sec--;
 		usec += 1000000;
@@ -426,16 +501,40 @@
 	return sec + usec;
 }
 
-uint64_t mtime_since_now(const struct timeval *s)
+uint64_t mtime_since_now(const struct timespec *s)
 {
-	struct timeval t;
+	struct timespec t;
+#ifdef FIO_DEBUG_TIME
 	void *p = __builtin_return_address(0);
 
 	fio_gettime(&t, p);
+#else
+	fio_gettime(&t, NULL);
+#endif
+
 	return mtime_since(s, &t);
 }
 
-uint64_t time_since_now(const struct timeval *s)
+uint64_t mtime_since(const struct timespec *s, const struct timespec *e)
+{
+	int64_t sec, usec;
+
+	sec = e->tv_sec - s->tv_sec;
+	usec = (e->tv_nsec - s->tv_nsec) / 1000;
+	if (sec > 0 && usec < 0) {
+		sec--;
+		usec += 1000000;
+	}
+
+	if (sec < 0 || (sec == 0 && usec < 0))
+		return 0;
+
+	sec *= 1000;
+	usec /= 1000;
+	return sec + usec;
+}
+
+uint64_t time_since_now(const struct timespec *s)
 {
 	return mtime_since_now(s) / 1000;
 }
@@ -444,7 +543,7 @@
     defined(CONFIG_SFAA)
 
 #define CLOCK_ENTRIES_DEBUG	100000
-#define CLOCK_ENTRIES_TEST	10000
+#define CLOCK_ENTRIES_TEST	1000
 
 struct clock_entry {
 	uint32_t seq;
diff -Nru fio-2.16/gettime.h fio-3.1/gettime.h
--- fio-2.16/gettime.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/gettime.h	2017-09-28 10:23:20.000000000 +0000
@@ -13,27 +13,27 @@
 	CS_INVAL,
 };
 
-extern void fio_gettime(struct timeval *, void *);
+extern void fio_gettime(struct timespec *, void *);
 extern void fio_gtod_init(void);
 extern void fio_clock_init(void);
 extern int fio_start_gtod_thread(void);
 extern int fio_monotonic_clocktest(int debug);
 extern void fio_local_clock_init(int);
 
-extern struct timeval *fio_tv;
+extern struct timespec *fio_ts;
 
-static inline int fio_gettime_offload(struct timeval *tv)
+static inline int fio_gettime_offload(struct timespec *ts)
 {
 	time_t last_sec;
 
-	if (!fio_tv)
+	if (!fio_ts)
 		return 0;
 
 	do {
 		read_barrier();
-		last_sec = tv->tv_sec = fio_tv->tv_sec;
-		tv->tv_usec = fio_tv->tv_usec;
-	} while (fio_tv->tv_sec != last_sec);
+		last_sec = ts->tv_sec = fio_ts->tv_sec;
+		ts->tv_nsec = fio_ts->tv_nsec;
+	} while (fio_ts->tv_sec != last_sec);
 
 	return 1;
 }
diff -Nru fio-2.16/gettime-thread.c fio-3.1/gettime-thread.c
--- fio-2.16/gettime-thread.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/gettime-thread.c	2017-09-28 10:23:20.000000000 +0000
@@ -6,30 +6,30 @@
 #include "fio.h"
 #include "smalloc.h"
 
-struct timeval *fio_tv = NULL;
+struct timespec *fio_ts = NULL;
 int fio_gtod_offload = 0;
 static pthread_t gtod_thread;
 static os_cpu_mask_t fio_gtod_cpumask;
 
 void fio_gtod_init(void)
 {
-	if (fio_tv)
+	if (fio_ts)
 		return;
 
-	fio_tv = smalloc(sizeof(struct timeval));
-	if (!fio_tv)
+	fio_ts = smalloc(sizeof(*fio_ts));
+	if (!fio_ts)
 		log_err("fio: smalloc pool exhausted\n");
 }
 
 static void fio_gtod_update(void)
 {
-	if (fio_tv) {
+	if (fio_ts) {
 		struct timeval __tv;
 
 		gettimeofday(&__tv, NULL);
-		fio_tv->tv_sec = __tv.tv_sec;
+		fio_ts->tv_sec = __tv.tv_sec;
 		write_barrier();
-		fio_tv->tv_usec = __tv.tv_usec;
+		fio_ts->tv_nsec = __tv.tv_usec * 1000;
 		write_barrier();
 	}
 }
diff -Nru fio-2.16/gfio.c fio-3.1/gfio.c
--- fio-2.16/gfio.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/gfio.c	2017-09-28 10:23:20.000000000 +0000
@@ -1215,7 +1215,7 @@
 {
 	const char *authors[] = {
 		"Jens Axboe <axboe@kernel.dk>",
-		"Stephen Carmeron <stephenmcameron@gmail.com>",
+		"Stephen Cameron <stephenmcameron@gmail.com>",
 		NULL
 	};
 	const char *license[] = {
@@ -1240,10 +1240,10 @@
 		"program-name", "gfio",
 		"comments", "Gtk2 UI for fio",
 		"license", license_trans,
-		"website", "http://git.kernel.dk/?p=fio.git;a=summary",
+		"website", "http://git.kernel.dk/cgit/fio/",
 		"authors", authors,
 		"version", fio_version_string,
-		"copyright", "© 2012 Jens Axboe <axboe@kernel.dk>",
+		"copyright", "© 2012-2017 Jens Axboe <axboe@kernel.dk>",
 		"logo-icon-name", "fio",
 		/* Must be last: */
 		"wrap-license", TRUE,
@@ -1271,7 +1271,7 @@
 	{ "Quit", GTK_STOCK_QUIT, NULL,   "<Control>Q", NULL, G_CALLBACK(quit_clicked) },
 	{ "About", GTK_STOCK_ABOUT, NULL,  NULL, NULL, G_CALLBACK(about_dialog) },
 };
-static gint nmenu_items = sizeof(menu_items) / sizeof(menu_items[0]);
+static gint nmenu_items = ARRAY_SIZE(menu_items);
 
 static const gchar *ui_string = " \
 	<ui> \
@@ -1386,7 +1386,7 @@
 	g_signal_connect(ge->eta.names, "changed", G_CALLBACK(combo_entry_changed), ge);
 	g_signal_connect(ge->eta.names, "destroy", G_CALLBACK(combo_entry_destroy), ge);
 	ge->eta.iotype.entry = new_info_entry_in_frame(probe_box, "IO");
-	ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write)");
+	ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write/Trim)");
 	ge->eta.ioengine.entry = new_info_entry_in_frame(probe_box, "IO Engine");
 	ge->eta.iodepth.entry = new_info_entry_in_frame(probe_box, "IO Depth");
 	ge->eta.jobs = new_info_entry_in_frame(probe_box, "Jobs");
@@ -1395,11 +1395,11 @@
 	probe_box = gtk_hbox_new(FALSE, 3);
 	gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, FALSE, FALSE, 3);
 	ge->eta.read_bw = new_info_entry_in_frame_rgb(probe_box, "Read BW", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B);
-	ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B);
+	ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "Read IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B);
 	ge->eta.write_bw = new_info_entry_in_frame_rgb(probe_box, "Write BW", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B);
-	ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B);
+	ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "Write IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B);
 	ge->eta.trim_bw = new_info_entry_in_frame_rgb(probe_box, "Trim BW", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B);
-	ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B);
+	ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "Trim IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B);
 
 	/*
 	 * Only add this if we have a commit rate
diff -Nru fio-2.16/.gitignore fio-3.1/.gitignore
--- fio-2.16/.gitignore	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/.gitignore	2017-09-28 10:23:20.000000000 +0000
@@ -10,3 +10,4 @@
 y.tab.*
 lex.yy.c
 *.un~
+doc/output
diff -Nru fio-2.16/goptions.c fio-3.1/goptions.c
--- fio-2.16/goptions.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/goptions.c	2017-09-28 10:23:20.000000000 +0000
@@ -826,7 +826,7 @@
 				     unsigned long long *p, unsigned int idx)
 {
 	struct gopt_str_val *g;
-	const gchar *postfix[] = { "B", "KB", "MB", "GB", "PB", "TB", "" };
+	const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "PiB", "PiB", "" };
 	GtkWidget *label;
 	int i;
 
diff -Nru fio-2.16/helper_thread.c fio-3.1/helper_thread.c
--- fio-2.16/helper_thread.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/helper_thread.c	2017-09-28 10:23:20.000000000 +0000
@@ -71,45 +71,45 @@
 {
 	struct helper_data *hd = data;
 	unsigned int msec_to_next_event, next_log, next_ss = STEADYSTATE_MSEC;
-	struct timeval tv, last_du, last_ss;
+	struct timeval tv;
+	struct timespec ts, last_du, last_ss;
 	int ret = 0;
 
 	sk_out_assign(hd->sk_out);
 
 	gettimeofday(&tv, NULL);
-	memcpy(&last_du, &tv, sizeof(tv));
-	memcpy(&last_ss, &tv, sizeof(tv));
+	ts.tv_sec = tv.tv_sec;
+	ts.tv_nsec = tv.tv_usec * 1000;
+	memcpy(&last_du, &ts, sizeof(ts));
+	memcpy(&last_ss, &ts, sizeof(ts));
 
 	fio_mutex_up(hd->startup_mutex);
 
 	msec_to_next_event = DISK_UTIL_MSEC;
 	while (!ret && !hd->exit) {
-		struct timespec ts;
-		struct timeval now;
 		uint64_t since_du, since_ss = 0;
 
-		timeval_add_msec(&tv, msec_to_next_event);
-		ts.tv_sec = tv.tv_sec;
-		ts.tv_nsec = tv.tv_usec * 1000;
+		timespec_add_msec(&ts, msec_to_next_event);
 
 		pthread_mutex_lock(&hd->lock);
 		pthread_cond_timedwait(&hd->cond, &hd->lock, &ts);
 
-		gettimeofday(&now, NULL);
+		gettimeofday(&tv, NULL);
+		ts.tv_sec = tv.tv_sec;
+		ts.tv_nsec = tv.tv_usec * 1000;
 
 		if (hd->reset) {
-			memcpy(&tv, &now, sizeof(tv));
-			memcpy(&last_du, &now, sizeof(last_du));
-			memcpy(&last_ss, &now, sizeof(last_ss));
+			memcpy(&last_du, &ts, sizeof(ts));
+			memcpy(&last_ss, &ts, sizeof(ts));
 			hd->reset = 0;
 		}
 
 		pthread_mutex_unlock(&hd->lock);
 
-		since_du = mtime_since(&last_du, &now);
+		since_du = mtime_since(&last_du, &ts);
 		if (since_du >= DISK_UTIL_MSEC || DISK_UTIL_MSEC - since_du < 10) {
 			ret = update_io_ticks();
-			timeval_add_msec(&last_du, DISK_UTIL_MSEC);
+			timespec_add_msec(&last_du, DISK_UTIL_MSEC);
 			msec_to_next_event = DISK_UTIL_MSEC;
 			if (since_du >= DISK_UTIL_MSEC)
 				msec_to_next_event -= (since_du - DISK_UTIL_MSEC);
@@ -126,10 +126,10 @@
 			next_log = DISK_UTIL_MSEC;
 
 		if (steadystate_enabled) {
-			since_ss = mtime_since(&last_ss, &now);
+			since_ss = mtime_since(&last_ss, &ts);
 			if (since_ss >= STEADYSTATE_MSEC || STEADYSTATE_MSEC - since_ss < 10) {
 				steadystate_check();
-				timeval_add_msec(&last_ss, since_ss);
+				timespec_add_msec(&last_ss, since_ss);
 				if (since_ss > STEADYSTATE_MSEC)
 					next_ss = STEADYSTATE_MSEC - (since_ss - STEADYSTATE_MSEC);
 				else
diff -Nru fio-2.16/HOWTO fio-3.1/HOWTO
--- fio-2.16/HOWTO	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/HOWTO	2017-09-28 10:23:20.000000000 +0000
@@ -1,2415 +1,3718 @@
-Table of contents
------------------
+How fio works
+-------------
+
+The first step in getting fio to simulate a desired I/O workload is writing a
+job file describing that specific setup. A job file may contain any number of
+threads and/or files -- the typical contents of the job file are a *global*
+section defining shared parameters, and one or more job sections describing the
+jobs involved. When run, fio parses this file and sets everything up as
+described. If we break down a job from top to bottom, it contains the following
+basic parameters:
+
+`I/O type`_
+
+		Defines the I/O pattern issued to the file(s).  We may only be reading
+		sequentially from the file(s), or we may be writing randomly. Or even
+		mixing reads and writes, sequentially or randomly.
+		Should we be doing buffered I/O, or direct/raw I/O?
+
+`Block size`_
+
+		How large are the chunks in which we issue I/O? This may be a single value,
+		or it may describe a range of block sizes.
+
+`I/O size`_
+
+		How much data are we going to be reading/writing?
+
+`I/O engine`_
+
+		How do we issue I/O? We could be memory mapping the file, we could be
+		using regular read/write, we could be using splice, async I/O, or even
+		SG (SCSI generic sg).
+
+`I/O depth`_
+
+		If the I/O engine is async, how large a queuing depth do we want to
+		maintain?
+
+
+`Target file/device`_
+
+		How many files are we spreading the workload over?
+
+`Threads, processes and job synchronization`_
+
+		How many threads or processes should we spread this workload over?
+
+The above are the basic parameters defined for a workload. In addition, there is
+a multitude of parameters that modify other aspects of how this job behaves.
+
+
+Command line options
+--------------------
+
+.. option:: --debug=type
+
+	Enable verbose tracing `type` of various fio actions.  May be ``all`` for all types
+	or individual types separated by a comma (e.g. ``--debug=file,mem`` will
+	enable file and memory debugging).  Currently, additional logging is
+	available for:
+
+	*process*
+			Dump info related to processes.
+	*file*
+			Dump info related to file actions.
+	*io*
+			Dump info related to I/O queuing.
+	*mem*
+			Dump info related to memory allocations.
+	*blktrace*
+			Dump info related to blktrace setup.
+	*verify*
+			Dump info related to I/O verification.
+	*all*
+			Enable all debug options.
+	*random*
+			Dump info related to random offset generation.
+	*parse*
+			Dump info related to option matching and parsing.
+	*diskutil*
+			Dump info related to disk utilization updates.
+	*job:x*
+			Dump info only related to job number x.
+	*mutex*
+			Dump info only related to mutex up/down ops.
+	*profile*
+			Dump info related to profile extensions.
+	*time*
+			Dump info related to internal time keeping.
+	*net*
+			Dump info related to networking connections.
+	*rate*
+			Dump info related to I/O rate switching.
+	*compress*
+			Dump info related to log compress/decompress.
+	*?* or *help*
+			Show available debug options.
+
+.. option:: --parse-only
+
+	Parse options only, don't start any I/O.
+
+.. option:: --output=filename
+
+	Write output to file `filename`.
+
+.. option:: --output-format=format
+
+	Set the reporting `format` to `normal`, `terse`, `json`, or `json+`.  Multiple
+	formats can be selected, separated by a comma.  `terse` is a CSV based
+	format.  `json+` is like `json`, except it adds a full dump of the latency
+	buckets.
+
+.. option:: --bandwidth-log
+
+	Generate aggregate bandwidth logs.
+
+.. option:: --minimal
+
+	Print statistics in a terse, semicolon-delimited format.
+
+.. option:: --append-terse
+
+	Print statistics in selected mode AND terse, semicolon-delimited format.
+	**Deprecated**, use :option:`--output-format` instead to select multiple
+	formats.
+
+.. option:: --terse-version=version
+
+	Set terse `version` output format (default 3; 2, 4 and 5 are also accepted).
+
+.. option:: --version
+
+	Print version information and exit.
+
+.. option:: --help
+
+	Print a summary of the command line options and exit.
+
+.. option:: --cpuclock-test
+
+	Perform test and validation of internal CPU clock.
+
+.. option:: --crctest=[test]
+
+	Test the speed of the built-in checksumming functions. If no argument is
+	given, all of them are tested. Alternatively, a comma separated list can
+	be passed, in which case the given ones are tested.
+
+.. option:: --cmdhelp=command
+
+	Print help information for `command`. May be ``all`` for all commands.
+
+.. option:: --enghelp=[ioengine[,command]]
+
+	List all commands defined by `ioengine`, or print help for `command`
+	defined by `ioengine`.  If no `ioengine` is given, list all
+	available ioengines.
+
+.. option:: --showcmd=jobfile
+
+	Convert `jobfile` to a set of command-line options.
+
+.. option:: --readonly
+
+	Turn on safety read-only checks, preventing writes.  The ``--readonly``
+	option is an extra safety guard to prevent users from accidentally starting
+	a write workload when that is not desired.  Fio will only write if
+	`rw=write/randwrite/rw/randrw` is given.  This extra safety net can be used
+	as an extra precaution as ``--readonly`` will also enable a write check in
+	the I/O engine core to prevent writes due to unknown user space bug(s).
+
+.. option:: --eta=when
+
+	Specifies when the real-time ETA estimate should be printed.  `when` may be
+	`always`, `never` or `auto`.
+
+.. option:: --eta-newline=time
+
+	Force a new line for every `time` period passed.  When the unit is omitted,
+	the value is interpreted in seconds.
+
+.. option:: --status-interval=time
+
+	Force a full status dump of cumulative (from job start) values at `time`
+	intervals. This option does *not* provide per-period measurements. So
+	values such as bandwidth are running averages. When the time unit is omitted,
+	`time` is interpreted in seconds.
+
+.. option:: --section=name
+
+	Only run specified section `name` in job file.  Multiple sections can be specified.
+	The ``--section`` option allows one to combine related jobs into one file.
+	E.g. one job file could define light, moderate, and heavy sections. Tell
+	fio to run only the "heavy" section by giving ``--section=heavy``
+	command line option.  One can also specify the "write" operations in one
+	section and "verify" operation in another section.  The ``--section`` option
+	only applies to job sections.  The reserved *global* section is always
+	parsed and used.
+
+.. option:: --alloc-size=kb
+
+	Set the internal smalloc pool size to `kb` in KiB.  The
+	``--alloc-size`` switch allows one to use a larger pool size for smalloc.
+	If running large jobs with randommap enabled, fio can run out of memory.
+	Smalloc is an internal allocator for shared structures from a fixed size
+	memory pool and can grow to 16 pools. The pool size defaults to 16MiB.
+
+	NOTE: While running, :file:`.fio_smalloc.*` backing store files are visible
+	in :file:`/tmp`.
+
+.. option:: --warnings-fatal
+
+	All fio parser warnings are fatal, causing fio to exit with an
+	error.
+
+.. option:: --max-jobs=nr
+
+	Set the maximum number of threads/processes to support to `nr`.
+
+.. option:: --server=args
+
+	Start a backend server, with `args` specifying what to listen to.
+	See `Client/Server`_ section.
+
+.. option:: --daemonize=pidfile
+
+	Background a fio server, writing the pid to the given `pidfile` file.
+
+.. option:: --client=hostname
+
+	Instead of running the jobs locally, send and run them on the given `hostname`
+	or set of hostnames.  See `Client/Server`_ section.
+
+.. option:: --remote-config=file
+
+	Tell fio server to load this local `file`.
+
+.. option:: --idle-prof=option
+
+	Report CPU idleness. `option` is one of the following:
+
+		**calibrate**
+			Run unit work calibration only and exit.
+
+		**system**
+			Show aggregate system idleness and unit work.
+
+		**percpu**
+			As **system** but also show per CPU idleness.
+
+.. option:: --inflate-log=log
+
+	Inflate and output compressed `log`.
+
+.. option:: --trigger-file=file
+
+	Execute trigger command when `file` exists.
+
+.. option:: --trigger-timeout=time
+
+	Execute trigger at this `time`.
+
+.. option:: --trigger=command
+
+	Set this `command` as local trigger.
+
+.. option:: --trigger-remote=command
+
+	Set this `command` as remote trigger.
+
+.. option:: --aux-path=path
+
+	Use this `path` for fio state generated files.
+
+Any parameters following the options will be assumed to be job files, unless
+they match a job file parameter. Multiple job files can be listed and each job
+file will be regarded as a separate group. Fio will :option:`stonewall`
+execution between each group.
+
+
+Job file format
+---------------
+
+As previously described, fio accepts one or more job files describing what it is
+supposed to do. The job file format is the classic ini file, where the names
+enclosed in [] brackets define the job name. You are free to use any ASCII name
+you want, except *global* which has special meaning.  Following the job name is
+a sequence of zero or more parameters, one per line, that define the behavior of
+the job. If the first character in a line is a ';' or a '#', the entire line is
+discarded as a comment.
+
+A *global* section sets defaults for the jobs described in that file. A job may
+override a *global* section parameter, and a job file may even have several
+*global* sections if so desired. A job is only affected by a *global* section
+residing above it.
+
+The :option:`--cmdhelp` option also lists all options. If used with a `command`
+argument, :option:`--cmdhelp` will detail the given `command`.
+
+See the `examples/` directory for inspiration on how to write job files.  Note
+the copyright and license requirements currently apply to `examples/` files.
+
+So let's look at a really simple job file that defines two processes, each
+randomly reading from a 128MiB file:
+
+.. code-block:: ini
+
+    ; -- start job file --
+    [global]
+    rw=randread
+    size=128m
+
+    [job1]
+
+    [job2]
+
+    ; -- end job file --
+
+As you can see, the job file sections themselves are empty as all the described
+parameters are shared. As no :option:`filename` option is given, fio makes up a
+`filename` for each of the jobs as it sees fit. On the command line, this job
+would look as follows::
+
+    $ fio --name=global --rw=randread --size=128m --name=job1 --name=job2
+
+
+Let's look at an example that has a number of processes writing randomly to
+files:
+
+.. code-block:: ini
+
+    ; -- start job file --
+    [random-writers]
+    ioengine=libaio
+    iodepth=4
+    rw=randwrite
+    bs=32k
+    direct=0
+    size=64m
+    numjobs=4
+    ; -- end job file --
+
+Here we have no *global* section, as we only have one job defined anyway.  We
+want to use async I/O here, with a depth of 4 for each file. We also increased
+the buffer size used to 32KiB and define numjobs to 4 to fork 4 identical
+jobs. The result is 4 processes each randomly writing to their own 64MiB
+file. Instead of using the above job file, you could have given the parameters
+on the command line. For this case, you would specify::
+
+    $ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4
+
+When fio is utilized as a basis of any reasonably large test suite, it might be
+desirable to share a set of standardized settings across multiple job files.
+Instead of copy/pasting such settings, any section may pull in an external
+:file:`filename.fio` file with an *include filename* directive, as in the following
+example::
+
+    ; -- start job file including.fio --
+    [global]
+    filename=/tmp/test
+    filesize=1m
+    include glob-include.fio
+
+    [test]
+    rw=randread
+    bs=4k
+    time_based=1
+    runtime=10
+    include test-include.fio
+    ; -- end job file including.fio --
+
+.. code-block:: ini
+
+    ; -- start job file glob-include.fio --
+    thread=1
+    group_reporting=1
+    ; -- end job file glob-include.fio --
+
+.. code-block:: ini
+
+    ; -- start job file test-include.fio --
+    ioengine=libaio
+    iodepth=4
+    ; -- end job file test-include.fio --
+
+Settings pulled into a section apply to that section only (except *global*
+section). Include directives may be nested in that any included file may contain
+further include directive(s). Include files may not contain [] sections.
+
+
+Environment variables
+~~~~~~~~~~~~~~~~~~~~~
+
+Fio also supports environment variable expansion in job files. Any sub-string of
+the form ``${VARNAME}`` as part of an option value (in other words, on the right
+of the '='), will be expanded to the value of the environment variable called
+`VARNAME`.  If no such environment variable is defined, or `VARNAME` is the
+empty string, the empty string will be substituted.
+
+As an example, let's look at a sample fio invocation and job file::
+
+    $ SIZE=64m NUMJOBS=4 fio jobfile.fio
+
+.. code-block:: ini
+
+    ; -- start job file --
+    [random-writers]
+    rw=randwrite
+    size=${SIZE}
+    numjobs=${NUMJOBS}
+    ; -- end job file --
+
+This will expand to the following equivalent job file at runtime:
+
+.. code-block:: ini
+
+    ; -- start job file --
+    [random-writers]
+    rw=randwrite
+    size=64m
+    numjobs=4
+    ; -- end job file --
+
+Fio ships with a few example job files; you can also look there for inspiration.
+
+Reserved keywords
+~~~~~~~~~~~~~~~~~
+
+Additionally, fio has a set of reserved keywords that will be replaced
+internally with the appropriate value. Those keywords are:
+
+**$pagesize**
+
+	The architecture page size of the running system.
+
+**$mb_memory**
+
+	Megabytes of total memory in the system.
+
+**$ncpus**
+
+	Number of online available CPUs.
+
+These can be used on the command line or in the job file, and will be
+automatically substituted with the current system values when the job is
+run. Simple math is also supported on these keywords, so you can perform actions
+like::
+
+	size=8*$mb_memory
+
+and get that properly expanded to 8 times the size of memory in the machine.
+
+
+Job file parameters
+-------------------
+
+This section describes in detail each parameter associated with a job.  Some
+parameters take an option of a given type, such as an integer or a
+string. Anywhere a numeric value is required, an arithmetic expression may be
+used, provided it is surrounded by parentheses. Supported operators are:
+
+	- addition (+)
+	- subtraction (-)
+	- multiplication (*)
+	- division (/)
+	- modulus (%)
+	- exponentiation (^)
+
+For time values in expressions, units are microseconds by default. This is
+different than for time values not in expressions (not enclosed in
+parentheses). The following types are used:
+
+
+Parameter types
+~~~~~~~~~~~~~~~
+
+**str**
+	String: A sequence of alphanumeric characters.
+
+**time**
+	Integer with possible time suffix.  Without a unit, the value is interpreted as
+	seconds unless otherwise specified.  Accepts a suffix of 'd' for days, 'h' for
+	hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and
+	'us' (or 'usec') for microseconds.  For example, use 10m for 10 minutes.
+
+.. _int:
+
+**int**
+	Integer. A whole number value, which may contain an integer prefix
+	and an integer suffix:
+
+	[*integer prefix*] **number** [*integer suffix*]
+
+	The optional *integer prefix* specifies the number's base. The default
+	is decimal. *0x* specifies hexadecimal.
+
+	The optional *integer suffix* specifies the number's units, and includes an
+	optional unit prefix and an optional unit.  For quantities of data, the
+	default unit is bytes. For quantities of time, the default unit is seconds
+	unless otherwise specified.
+
+	With :option:`kb_base`\=1000, fio follows international standards for unit
+	prefixes.  To specify power-of-10 decimal values defined in the
+	International System of Units (SI):
+
+		* *K* -- means kilo (K) or 1000
+		* *M* -- means mega (M) or 1000**2
+		* *G* -- means giga (G) or 1000**3
+		* *T* -- means tera (T) or 1000**4
+		* *P* -- means peta (P) or 1000**5
+
+	To specify power-of-2 binary values defined in IEC 80000-13:
+
+		* *Ki* -- means kibi (Ki) or 1024
+		* *Mi* -- means mebi (Mi) or 1024**2
+		* *Gi* -- means gibi (Gi) or 1024**3
+		* *Ti* -- means tebi (Ti) or 1024**4
+		* *Pi* -- means pebi (Pi) or 1024**5
+
+	With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite
+	from those specified in the SI and IEC 80000-13 standards to provide
+	compatibility with old scripts.  For example, 4k means 4096.
+
+	For quantities of data, an optional unit of 'B' may be included
+	(e.g., 'kB' is the same as 'k').
+
+	The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
+	not milli). 'b' and 'B' both mean byte, not bit.
+
+	Examples with :option:`kb_base`\=1000:
+
+		* *4 KiB*: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
+		* *1 MiB*: 1048576, 1mi, 1024ki
+		* *1 MB*: 1000000, 1m, 1000k
+		* *1 TiB*: 1099511627776, 1ti, 1024gi, 1048576mi
+		* *1 TB*: 1000000000000, 1t, 1000g, 1000000m
+
+	Examples with :option:`kb_base`\=1024 (default):
+
+		* *4 KiB*: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+		* *1 MiB*: 1048576, 1m, 1024k
+		* *1 MB*: 1000000, 1mi, 1000ki
+		* *1 TiB*: 1099511627776, 1t, 1024g, 1048576m
+		* *1 TB*: 1000000000000, 1ti, 1000gi, 1000000mi
+
+	To specify times (units are not case sensitive):
+
+		* *D* -- means days
+		* *H* -- means hours
+		* *M* -- means minutes
+		* *s* -- or *sec* means seconds (default)
+		* *ms* -- or *msec* means milliseconds
+		* *us* -- or *usec* means microseconds
+
+	If the option accepts an upper and lower range, use a colon ':' or
+	minus '-' to separate such values. See :ref:`irange <irange>`.
+	If the lower value specified happens to be larger than the upper value
+	the two values are swapped.
+
+.. _bool:
+
+**bool**
+	Boolean. Usually parsed as an integer, however only defined for
+	true and false (1 and 0).
+
+.. _irange:
+
+**irange**
+	Integer range with suffix. Allows value range to be given, such as
+	1024-4096. A colon may also be used as the separator, e.g. 1k:4k. If the
+	option allows two sets of ranges, they can be specified with a ',' or '/'
+	delimiter: 1k-4k/8k-32k. Also see :ref:`int <int>`.
+
+**float_list**
+	A list of floating point numbers, separated by a ':' character.
+
+With the above in mind, here follows the complete list of fio job parameters.
+
+
+Units
+~~~~~
+
+.. option:: kb_base=int
+
+	Select the interpretation of unit prefixes in input parameters.
+
+		**1000**
+			Inputs comply with IEC 80000-13 and the International
+			System of Units (SI). Use:
+
+				- power-of-2 values with IEC prefixes (e.g., KiB)
+				- power-of-10 values with SI prefixes (e.g., kB)
+
+		**1024**
+			Compatibility mode (default).  To avoid breaking old scripts:
+
+				- power-of-2 values with SI prefixes
+				- power-of-10 values with IEC prefixes
+
+	See :option:`bs` for more details on input parameters.
+
+	Outputs always use correct prefixes.  Most outputs include both
+	side-by-side, like::
+
+		bw=2383.3kB/s (2327.4KiB/s)
+
+	If only one value is reported, then kb_base selects the one to use:
+
+		**1000** -- SI prefixes
+
+		**1024** -- IEC prefixes
+
+.. option:: unit_base=int
+
+	Base unit for reporting.  Allowed values are:
+
+	**0**
+		Use auto-detection (default).
+	**8**
+		Byte based.
+	**1**
+		Bit based.
+
+
+Job description
+~~~~~~~~~~~~~~~
+
+.. option:: name=str
+
+	ASCII name of the job. This may be used to override the name printed by fio
+	for this job. Otherwise the job name is used. On the command line this
+	parameter has the special purpose of also signaling the start of a new job.
+
+.. option:: description=str
+
+	Text description of the job. Doesn't do anything except dump this text
+	description when this job is run. It's not parsed.
+
+.. option:: loops=int
+
+	Run the specified number of iterations of this job. Used to repeat the same
+	workload a given number of times. Defaults to 1.
+
+.. option:: numjobs=int
+
+	Create the specified number of clones of this job. Each clone of the job
+	is spawned as an independent thread or process. May be used to set up a
+	larger number of threads/processes doing the same thing. Each thread is
+	reported separately; to see statistics for all clones as a whole, use
+	:option:`group_reporting` in conjunction with :option:`new_group`.
+	See :option:`--max-jobs`.  Default: 1.
+
+
+Time related parameters
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: runtime=time
+
+	Tell fio to terminate processing after the specified period of time.  It
+	can be quite hard to determine for how long a specified job will run, so
+	this parameter is handy to cap the total runtime to a given time.  When
+	the unit is omitted, the value is interpreted in seconds.
+
+.. option:: time_based
+
+	If set, fio will run for the duration of the :option:`runtime` specified
+	even if the file(s) are completely read or written. It will simply loop over
+	the same workload as many times as the :option:`runtime` allows.
+
+.. option:: startdelay=irange(time)
+
+	Delay the start of job for the specified amount of time.  Can be a single
+	value or a range.  When given as a range, each thread will choose a value
+	randomly from within the range.  Value is in seconds if a unit is omitted.
+
+.. option:: ramp_time=time
+
+	If set, fio will run the specified workload for this amount of time before
+	logging any performance numbers. Useful for letting performance settle
+	before logging results, thus minimizing the runtime required for stable
+	results. Note that the ``ramp_time`` is considered lead in time for a job,
+	thus it will increase the total runtime if a special timeout or
+	:option:`runtime` is specified.  When the unit is omitted, the value is
+	given in seconds.
+
+.. option:: clocksource=str
+
+	Use the given clocksource as the base of timing. The supported options are:
+
+		**gettimeofday**
+			:manpage:`gettimeofday(2)`
+
+		**clock_gettime**
+			:manpage:`clock_gettime(2)`
+
+		**cpu**
+			Internal CPU clock source
+
+	cpu is the preferred clocksource if it is reliable, as it is very fast (and
+	fio is heavy on time calls). Fio will automatically use this clocksource if
+	it's supported and considered reliable on the system it is running on,
+	unless another clocksource is specifically set. For x86/x86-64 CPUs, this
+	means supporting TSC Invariant.
+
+.. option:: gtod_reduce=bool
+
+	Enable all of the :manpage:`gettimeofday(2)` reducing options
+	(:option:`disable_clat`, :option:`disable_slat`, :option:`disable_bw_measurement`) plus
+	reduce precision of the timeout somewhat to really shrink the
+	:manpage:`gettimeofday(2)` call count. With this option enabled, we only do
+	about 0.4% of the :manpage:`gettimeofday(2)` calls we would have done if all
+	time keeping was enabled.
+
+.. option:: gtod_cpu=int
+
+	Sometimes it's cheaper to dedicate a single thread of execution to just
+	getting the current time. Fio (and databases, for instance) are very
+	intensive on :manpage:`gettimeofday(2)` calls. With this option, you can set
+	one CPU aside for doing nothing but logging current time to a shared memory
+	location. Then the other threads/processes that run I/O workloads need only
+	copy that segment, instead of entering the kernel with a
+	:manpage:`gettimeofday(2)` call. The CPU set aside for doing these time
+	calls will be excluded from other uses. Fio will manually clear it from the
+	CPU mask of other jobs.
+
+
+Target file/device
+~~~~~~~~~~~~~~~~~~
+
+.. option:: directory=str
+
+	Prefix filenames with this directory. Used to place files in a different
+	location than :file:`./`.  You can specify a number of directories by
+	separating the names with a ':' character. These directories will be
+	assigned equally distributed to job clones created by :option:`numjobs` as
+	long as they are using generated filenames. If specific `filename(s)` are
+	set, fio will use the first listed directory, thereby matching the
+	`filename` semantic, which generates a file for each clone if not specified
+	but lets all clones use the same file if set.
+
+	See the :option:`filename` option for information on how to escape "``:``" and
+	"``\``" characters within the directory path itself.
+
+.. option:: filename=str
+
+	Fio normally makes up a `filename` based on the job name, thread number, and
+	file number (see :option:`filename_format`). If you want to share files
+	between threads in a job or several
+	jobs with fixed file paths, specify a `filename` for each of them to override
+	the default. If the ioengine is file based, you can specify a number of files
+	by separating the names with a ':' colon. So if you wanted a job to open
+	:file:`/dev/sda` and :file:`/dev/sdb` as the two working files, you would use
+	``filename=/dev/sda:/dev/sdb``. This also means that whenever this option is
+	specified, :option:`nrfiles` is ignored. The size of regular files specified
+	by this option will be :option:`size` divided by number of files unless an
+	explicit size is specified by :option:`filesize`.
+
+	Each colon and backslash in the wanted path must be escaped with a ``\``
+	character.  For instance, if the path is :file:`/dev/dsk/foo@3,0:c` then you
+	would use ``filename=/dev/dsk/foo@3,0\:c`` and if the path is
+	:file:`F:\\filename` then you would use ``filename=F\:\\filename``.
+
+	On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for
+	the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc.
+	Note: Windows and FreeBSD prevent write access to areas
+	of the disk containing in-use data (e.g. filesystems).
+
+	The filename "`-`" is a reserved name, meaning *stdin* or *stdout*.  Which
+	of the two depends on the read/write direction set.
+
+.. option:: filename_format=str
+
+	If sharing multiple files between jobs, it is usually necessary to have fio
+	generate the exact names that you want. By default, fio will name a file
+	based on the default file format specification of
+	:file:`jobname.jobnumber.filenumber`. With this option, that can be
+	customized. Fio will recognize and replace the following keywords in this
+	string:
+
+		**$jobname**
+				The name of the worker thread or process.
+		**$jobnum**
+				The incremental number of the worker thread or process.
+		**$filenum**
+				The incremental number of the file for that worker thread or
+				process.
+
+	To have dependent jobs share a set of files, this option can be set to have
+	fio generate filenames that are shared between the two. For instance, if
+	:file:`testfiles.$filenum` is specified, file number 4 for any job will be
+	named :file:`testfiles.4`. The default of :file:`$jobname.$jobnum.$filenum`
+	will be used if no other format specifier is given.
+
+.. option:: unique_filename=bool
+
+	To avoid collisions between networked clients, fio defaults to prefixing any
+	generated filenames (with a directory specified) with the source of the
+	client connecting. To disable this behavior, set this option to 0.
+
+.. option:: opendir=str
+
+	Recursively open any files below directory `str`.
+
+.. option:: lockfile=str
+
+	Fio defaults to not locking any files before it does I/O to them. If a file
+	or file descriptor is shared, fio can serialize I/O to that file to make the
+	end result consistent. This is useful for emulating real workloads that share
+	files. The lock modes are:
+
+		**none**
+			No locking. The default.
+		**exclusive**
+			Only one thread or process may do I/O at a time, excluding all
+			others.
+		**readwrite**
+			Read-write locking on the file. Many readers may
+			access the file at the same time, but writes get exclusive access.
+
+.. option:: nrfiles=int
+
+	Number of files to use for this job. Defaults to 1. The size of files
+	will be :option:`size` divided by this unless explicit size is specified by
+	:option:`filesize`. Files are created for each thread separately, and each
+	file will have a file number within its name by default, as explained in
+	:option:`filename` section.
+
+
+.. option:: openfiles=int
+
+	Number of files to keep open at the same time. Defaults to the same as
+	:option:`nrfiles`, can be set smaller to limit the number of simultaneous
+	opens.
+
+.. option:: file_service_type=str
+
+	Defines how fio decides which file from a job to service next. The following
+	types are defined:
+
+		**random**
+			Choose a file at random.
+
+		**roundrobin**
+			Round robin over opened files. This is the default.
+
+		**sequential**
+			Finish one file before moving on to the next. Multiple files can
+			still be open depending on :option:`openfiles`.
+
+		**zipf**
+			Use a *Zipf* distribution to decide what file to access.
+
+		**pareto**
+			Use a *Pareto* distribution to decide what file to access.
+
+		**normal**
+			Use a *Gaussian* (normal) distribution to decide what file to
+			access.
+
+		**gauss**
+			Alias for normal.
+
+	For *random*, *roundrobin*, and *sequential*, a postfix can be appended to
+	tell fio how many I/Os to issue before switching to a new file. For example,
+	specifying ``file_service_type=random:8`` would cause fio to issue
+	8 I/Os before selecting a new file at random. For the non-uniform
+	distributions, a floating point postfix can be given to influence how the
+	distribution is skewed. See :option:`random_distribution` for a description
+	of how that would work.
+
+.. option:: ioscheduler=str
+
+	Attempt to switch the device hosting the file to the specified I/O scheduler
+	before running.
+
+.. option:: create_serialize=bool
+
+	If true, serialize the file creation for the jobs.  This may be handy to
+	avoid interleaving of data files, which may greatly depend on the filesystem
+	used and even the number of processors in the system.  Default: true.
+
+.. option:: create_fsync=bool
+
+	:manpage:`fsync(2)` the data file after creation. This is the default.
+
+.. option:: create_on_open=bool
+
+	If true, don't pre-create files but allow the job's open() to create a file
+	when it's time to do I/O.  Default: false -- pre-create all necessary files
+	when the job starts.
+
+.. option:: create_only=bool
+
+	If true, fio will only run the setup phase of the job.  If files need to be
+	laid out or updated on disk, only that will be done -- the actual job contents
+	are not executed.  Default: false.
+
+.. option:: allow_file_create=bool
+
+	If true, fio is permitted to create files as part of its workload.  If this
+	option is false, then fio will error out if
+	the files it needs to use don't already exist. Default: true.
+
+.. option:: allow_mounted_write=bool
+
+	If this isn't set, fio will abort jobs that are destructive (e.g. that write)
+	to what appears to be a mounted device or partition. This should help catch
+	creating inadvertently destructive tests, not realizing that the test will
+	destroy data on the mounted file system. Note that some platforms don't allow
+	writing against a mounted device regardless of this option. Default: false.
+
+.. option:: pre_read=bool
+
+	If this is given, files will be pre-read into memory before starting the
+	given I/O operation. This will also clear the :option:`invalidate` flag,
+	since it is pointless to pre-read and then drop the cache. This will only
+	work for I/O engines that are seek-able, since they allow you to read the
+	same data multiple times. Thus it will not work on non-seekable I/O engines
+	(e.g. network, splice). Default: false.
+
+.. option:: unlink=bool
+
+	Unlink the job files when done. Not the default, as repeated runs of that
+	job would then waste time recreating the file set again and again. Default:
+	false.
+
+.. option:: unlink_each_loop=bool
+
+	Unlink job files after each iteration or loop.  Default: false.
+
+.. option:: zonesize=int
+
+	Divide a file into zones of the specified size. See :option:`zoneskip`.
+
+.. option:: zonerange=int
+
+	Give size of an I/O zone.  See :option:`zoneskip`.
+
+.. option:: zoneskip=int
+
+	Skip the specified number of bytes when :option:`zonesize` data has been
+	read. The two zone options can be used to only do I/O on zones of a file.
+
+
+I/O type
+~~~~~~~~
+
+.. option:: direct=bool
+
+	If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that
+	OpenBSD and ZFS on Solaris don't support direct I/O.  On Windows the synchronous
+	ioengines don't support direct I/O.  Default: false.
+
+.. option:: atomic=bool
+
+	If value is true, attempt to use atomic direct I/O. Atomic writes are
+	guaranteed to be stable once acknowledged by the operating system. Only
+	Linux supports O_ATOMIC right now.
+
+.. option:: buffered=bool
+
+	If value is true, use buffered I/O. This is the opposite of the
+	:option:`direct` option. Defaults to true.
+
+.. option:: readwrite=str, rw=str
+
+	Type of I/O pattern. Accepted values are:
+
+		**read**
+				Sequential reads.
+		**write**
+				Sequential writes.
+		**trim**
+				Sequential trims (Linux block devices only).
+		**randread**
+				Random reads.
+		**randwrite**
+				Random writes.
+		**randtrim**
+				Random trims (Linux block devices only).
+		**rw,readwrite**
+				Sequential mixed reads and writes.
+		**randrw**
+				Random mixed reads and writes.
+		**trimwrite**
+				Sequential trim+write sequences. Blocks will be trimmed first,
+				then the same blocks will be written to.
+
+	Fio defaults to read if the option is not specified.  For the mixed I/O
+	types, the default is to split them 50/50.  For certain types of I/O the
+	result may still be skewed a bit, since the speed may be different.
+
+	It is possible to specify the number of I/Os to do before getting a new
+	offset by appending ``:<nr>`` to the end of the string given.  For a
+	random read, it would look like ``rw=randread:8`` for passing in an offset
+	modifier with a value of 8. If the suffix is used with a sequential I/O
+	pattern, then the *<nr>* value specified will be **added** to the generated
+	offset for each I/O turning sequential I/O into sequential I/O with holes.
+	For instance, using ``rw=write:4k`` will skip 4k for every write.  Also see
+	the :option:`rw_sequencer` option.
+
+.. option:: rw_sequencer=str
+
+	If an offset modifier is given by appending a number to the ``rw=<str>``
+	line, then this option controls how that number modifies the I/O offset
+	being generated. Accepted values are:
+
+		**sequential**
+			Generate sequential offset.
+		**identical**
+			Generate the same offset.
+
+	``sequential`` is only useful for random I/O, where fio would normally
+	generate a new random offset for every I/O. If you append e.g. 8 to randread,
+	you would get a new random offset for every 8 I/Os. The result would be a
+	seek for only every 8 I/Os, instead of for every I/O. Use ``rw=randread:8``
+	to specify that. As sequential I/O is already sequential, setting
+	``sequential`` for that would not result in any differences.  ``identical``
+	behaves in a similar fashion, except it sends the same offset 8 times
+	before generating a new offset.
+
+.. option:: unified_rw_reporting=bool
+
+	Fio normally reports statistics on a per data direction basis, meaning that
+	reads, writes, and trims are accounted and reported separately. If this
+	option is set, fio sums the results and reports them as "mixed" instead.
+
+.. option:: randrepeat=bool
+
+	Seed the random number generator used for random I/O patterns in a
+	predictable way so the pattern is repeatable across runs. Default: true.
+
+.. option:: allrandrepeat=bool
+
+	Seed all random number generators in a predictable way so results are
+	repeatable across runs.  Default: false.
+
+.. option:: randseed=int
+
+	Seed the random number generators based on this seed value, to be able to
+	control what sequence of output is being generated.  If not set, the random
+	sequence depends on the :option:`randrepeat` setting.
+
+.. option:: fallocate=str
+
+	Whether pre-allocation is performed when laying down files.
+	Accepted values are:
+
+		**none**
+			Do not pre-allocate space.
+
+		**native**
+			Use a platform's native pre-allocation call but fall back to
+			**none** behavior if it fails/is not implemented.
+
+		**posix**
+			Pre-allocate via :manpage:`posix_fallocate(3)`.
+
+		**keep**
+			Pre-allocate via :manpage:`fallocate(2)` with
+			FALLOC_FL_KEEP_SIZE set.
+
+		**0**
+			Backward-compatible alias for **none**.
+
+		**1**
+			Backward-compatible alias for **posix**.
+
+	May not be available on all supported platforms. **keep** is only available
+	on Linux. If using ZFS on Solaris this cannot be set to **posix**
+	because ZFS doesn't support pre-allocation. Default: **native** if any
+	pre-allocation methods are available, **none** if not.
+
+.. option:: fadvise_hint=str
+
+	Use :manpage:`posix_fadvise(2)` to advise the kernel on what I/O patterns
+	are likely to be issued.  Accepted values are:
+
+		**0**
+			Backwards-compatible hint for "no hint".
+
+		**1**
+			Backwards-compatible hint for "advise with fio workload type". This
+			uses **FADV_RANDOM** for a random workload, and **FADV_SEQUENTIAL**
+			for a sequential workload.
+
+		**sequential**
+			Advise using **FADV_SEQUENTIAL**.
+
+		**random**
+			Advise using **FADV_RANDOM**.
+
+.. option:: write_hint=str
+
+	Use :manpage:`fcntl(2)` to advise the kernel what life time to expect
+	from a write. Only supported on Linux, as of version 4.13. Accepted
+	values are:
+
+		**none**
+			No particular life time associated with this file.
+
+		**short**
+			Data written to this file has a short life time.
+
+		**medium**
+			Data written to this file has a medium life time.
+
+		**long**
+			Data written to this file has a long life time.
+
+		**extreme**
+			Data written to this file has a very long life time.
+
+	The values are all relative to each other, and no absolute meaning
+	should be associated with them.
+
+.. option:: offset=int
+
+	Start I/O at the provided offset in the file, given as either a fixed size in
+	bytes or a percentage. If a percentage is given, the next ``blockalign``-ed
+	offset will be used. Data before the given offset will not be touched. This
+	effectively caps the file size at `real_size - offset`. Can be combined with
+	:option:`size` to constrain the start and end range of the I/O workload.
+	A percentage can be specified by a number between 1 and 100 followed by '%',
+	for example, ``offset=20%`` to specify 20%.
+
+.. option:: offset_increment=int
+
+	If this is provided, then the real offset becomes `offset + offset_increment
+	* thread_number`, where the thread number is a counter that starts at 0 and
+	is incremented for each sub-job (i.e. when :option:`numjobs` option is
+	specified). This option is useful if there are several jobs which are
+	intended to operate on a file in parallel disjoint segments, with even
+	spacing between the starting points.
+
+.. option:: number_ios=int
+
+	Fio will normally perform I/Os until it has exhausted the size of the region
+	set by :option:`size`, or if it exhausts the allocated time (or hits an error
+	condition). With this setting, the range/size can be set independently of
+	the number of I/Os to perform. When fio reaches this number, it will exit
+	normally and report status. Note that this does not extend the amount of I/O
+	that will be done, it will only stop fio if this condition is met before
+	other end-of-job criteria.
+
+.. option:: fsync=int
+
+	If writing to a file, issue an :manpage:`fsync(2)` (or its equivalent) of
+	the dirty data for every number of blocks given. For example, if you give 32
+	as a parameter, fio will sync the file after every 32 writes issued. If fio is
+	using non-buffered I/O, we may not sync the file. The exception is the sg
+	I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which
+	means fio does not periodically issue and wait for a sync to complete. Also
+	see :option:`end_fsync` and :option:`fsync_on_close`.
+
+.. option:: fdatasync=int
+
+	Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and
+	not metadata blocks.  In Windows, FreeBSD, and DragonFlyBSD there is no
+	:manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`.
+	Defaults to 0, which means fio does not periodically issue and wait for a
+	data-only sync to complete.
+
+.. option:: write_barrier=int
+
+	Make every `N-th` write a barrier write.
+
+.. option:: sync_file_range=str:int
+
+	Use :manpage:`sync_file_range(2)` for every `int` number of write
+	operations. Fio will track range of writes that have happened since the last
+	:manpage:`sync_file_range(2)` call. `str` can currently be one or more of:
+
+		**wait_before**
+			SYNC_FILE_RANGE_WAIT_BEFORE
+		**write**
+			SYNC_FILE_RANGE_WRITE
+		**wait_after**
+			SYNC_FILE_RANGE_WAIT_AFTER
+
+	So if you do ``sync_file_range=wait_before,write:8``, fio would use
+	``SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE`` for every 8
+	writes. Also see the :manpage:`sync_file_range(2)` man page.  This option is
+	Linux specific.
+
+.. option:: overwrite=bool
+
+	If true, writes to a file will always overwrite existing data. If the file
+	doesn't already exist, it will be created before the write phase begins. If
+	the file exists and is large enough for the specified write phase, nothing
+	will be done. Default: false.
+
+.. option:: end_fsync=bool
+
+	If true, :manpage:`fsync(2)` file contents when a write stage has completed.
+	Default: false.
+
+.. option:: fsync_on_close=bool
+
+	If true, fio will :manpage:`fsync(2)` a dirty file on close.  This differs
+	from :option:`end_fsync` in that it will happen on every file close, not
+	just at the end of the job.  Default: false.
+
+.. option:: rwmixread=int
+
+	Percentage of a mixed workload that should be reads. Default: 50.
+
+.. option:: rwmixwrite=int
+
+	Percentage of a mixed workload that should be writes. If both
+	:option:`rwmixread` and :option:`rwmixwrite` are given and the values do not
+	add up to 100%, the latter of the two will be used to override the
+	first. This may interfere with a given rate setting, if fio is asked to
+	limit reads or writes to a certain rate.  If that is the case, then the
+	distribution may be skewed. Default: 50.
+
+.. option:: random_distribution=str:float[,str:float][,str:float]
+
+	By default, fio will use a completely uniform random distribution when asked
+	to perform random I/O. Sometimes it is useful to skew the distribution in
+	specific ways, ensuring that some parts of the data are hotter than others.
+	fio includes the following distribution models:
+
+		**random**
+				Uniform random distribution
+
+		**zipf**
+				Zipf distribution
+
+		**pareto**
+				Pareto distribution
+
+		**normal**
+				Normal (Gaussian) distribution
+
+		**zoned**
+				Zoned random distribution
+
+	When using a **zipf** or **pareto** distribution, an input value is also
+	needed to define the access pattern. For **zipf**, this is the `Zipf
+	theta`. For **pareto**, it's the `Pareto power`. Fio includes a test
+	program, :command:`fio-genzipf`, that can be used to visualize what the given input
+	values will yield in terms of hit rates.  If you wanted to use **zipf** with
+	a `theta` of 1.2, you would use ``random_distribution=zipf:1.2`` as the
+	option. If a non-uniform model is used, fio will disable use of the random
+	map. For the **normal** distribution, a normal (Gaussian) deviation is
+	supplied as a value between 0 and 100.
+
+	For a **zoned** distribution, fio supports specifying percentages of I/O
+	access that should fall within what range of the file or device. For
+	example, given a criteria of:
+
+		* 60% of accesses should be to the first 10%
+		* 30% of accesses should be to the next 20%
+		* 8% of accesses should be to the next 30%
+		* 2% of accesses should be to the next 40%
+
+	we can define that through zoning of the random accesses. For the above
+	example, the user would do::
+
+		random_distribution=zoned:60/10:30/20:8/30:2/40
+
+	similarly to how :option:`bssplit` works for setting ranges and percentages
+	of block sizes. Like :option:`bssplit`, it's possible to specify separate
+	zones for reads, writes, and trims. If just one set is given, it'll apply to
+	all of them.
+
+.. option:: percentage_random=int[,int][,int]
+
+	For a random workload, set how big a percentage should be random. This
+	defaults to 100%, in which case the workload is fully random. It can be set
+	anywhere from 0 to 100.  Setting it to 0 would make the workload fully
+	sequential. Any setting in between will result in a random mix of sequential
+	and random I/O, at the given percentages.  Comma-separated values may be
+	specified for reads, writes, and trims as described in :option:`blocksize`.
+
+.. option:: norandommap
+
+	Normally fio will cover every block of the file when doing random I/O. If
+	this option is given, fio will just get a new random offset without looking
+	at past I/O history. This means that some blocks may not be read or written,
+	and that some blocks may be read/written more than once. If this option is
+	used with :option:`verify` and multiple blocksizes (via :option:`bsrange`),
+	only intact blocks are verified, i.e., partially-overwritten blocks are
+	ignored.
+
+.. option:: softrandommap=bool
+
+	See :option:`norandommap`. If fio runs with the random block map enabled and
+	it fails to allocate the map, if this option is set it will continue without
+	a random block map. As coverage will not be as complete as with random maps,
+	this option is disabled by default.
+
+.. option:: random_generator=str
+
+	Fio supports the following engines for generating I/O offsets for random I/O:
+
+		**tausworthe**
+			Strong 2^88 cycle random number generator.
+		**lfsr**
+			Linear feedback shift register generator.
+		**tausworthe64**
+			Strong 64-bit 2^258 cycle random number generator.
+
+	**tausworthe** is a strong random number generator, but it requires tracking
+	on the side if we want to ensure that blocks are only read or written
+	once. **lfsr** guarantees that we never generate the same offset twice, and
+	it's also less computationally expensive. It's not a true random generator,
+	however, though for I/O purposes it's typically good enough. **lfsr** only
+	works with single block sizes, not with workloads that use multiple block
+	sizes. If used with such a workload, fio may read or write some blocks
+	multiple times. The default value is **tausworthe**, unless the required
+	space exceeds 2^32 blocks. If it does, then **tausworthe64** is
+	selected automatically.
+
+
+Block size
+~~~~~~~~~~
+
+.. option:: blocksize=int[,int][,int], bs=int[,int][,int]
+
+	The block size in bytes used for I/O units. Default: 4096.  A single value
+	applies to reads, writes, and trims.  Comma-separated values may be
+	specified for reads, writes, and trims.  A value not terminated in a comma
+	applies to subsequent types.
+
+	Examples:
+
+		**bs=256k**
+			means 256k for reads, writes and trims.
+
+		**bs=8k,32k**
+			means 8k for reads, 32k for writes and trims.
+
+		**bs=8k,32k,**
+			means 8k for reads, 32k for writes, and default for trims.
+
+		**bs=,8k**
+			means default for reads, 8k for writes and trims.
+
+		**bs=,8k,**
+			means default for reads, 8k for writes, and default for trims.
+
+.. option:: blocksize_range=irange[,irange][,irange], bsrange=irange[,irange][,irange]
+
+	A range of block sizes in bytes for I/O units.  The issued I/O unit will
+	always be a multiple of the minimum size, unless
+	:option:`blocksize_unaligned` is set.
+
+	Comma-separated ranges may be specified for reads, writes, and trims as
+	described in :option:`blocksize`.
+
+	Example: ``bsrange=1k-4k,2k-8k``.
+
+.. option:: bssplit=str[,str][,str]
+
+	Sometimes you want even finer grained control of the block sizes issued, not
+	just an even split between them.  This option allows you to weight various
+	block sizes, so that you are able to define a specific amount of block sizes
+	issued. The format for this option is::
+
+		bssplit=blocksize/percentage:blocksize/percentage
+
+	for as many block sizes as needed. So if you want to define a workload that
+	has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would write::
+
+		bssplit=4k/10:64k/50:32k/40
+
+	Ordering does not matter. If the percentage is left blank, fio will fill in
+	the remaining values evenly. So a bssplit option like this one::
+
+		bssplit=4k/50:1k/:32k/
+
+	would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always add up
+	to 100; if bssplit is given a range that adds up to more, it will error out.
+
+	Comma-separated values may be specified for reads, writes, and trims as
+	described in :option:`blocksize`.
+
+	If you want a workload that has 50% 2k reads and 50% 4k reads, while having
+	90% 4k writes and 10% 8k writes, you would specify::
+
+		bssplit=2k/50:4k/50,4k/90:8k/10
+
+.. option:: blocksize_unaligned, bs_unaligned
+
+	If set, fio will issue I/O units with any size within
+	:option:`blocksize_range`, not just multiples of the minimum size.  This
+	typically won't work with direct I/O, as that normally requires sector
+	alignment.
+
+.. option:: bs_is_seq_rand=bool
+
+	If this option is set, fio will use the normal read,write blocksize settings
+	as sequential,random blocksize settings instead. Any random read or write
+	will use the WRITE blocksize settings, and any sequential read or write will
+	use the READ blocksize settings.
+
+.. option:: blockalign=int[,int][,int], ba=int[,int][,int]
+
+	Boundary to which fio will align random I/O units.  Default:
+	:option:`blocksize`.  Minimum alignment is typically 512b for using direct
+	I/O, though it usually depends on the hardware block size. This option is
+	mutually exclusive with using a random map for files, so it will turn off
+	that option.  Comma-separated values may be specified for reads, writes, and
+	trims as described in :option:`blocksize`.
+
+
+Buffers and memory
+~~~~~~~~~~~~~~~~~~
+
+.. option:: zero_buffers
+
+	Initialize buffers with all zeros. Default: fill buffers with random data.
+
+.. option:: refill_buffers
+
+	If this option is given, fio will refill the I/O buffers on every
+	submit. The default is to only fill it at init time and reuse that
+	data. Only makes sense if zero_buffers isn't specified, naturally. If data
+	verification is enabled, `refill_buffers` is also automatically enabled.
+
+.. option:: scramble_buffers=bool
+
+	If :option:`refill_buffers` is too costly and the target is using data
+	deduplication, then setting this option will slightly modify the I/O buffer
+	contents to defeat normal de-dupe attempts. This is not enough to defeat
+	more clever block compression attempts, but it will stop naive dedupe of
+	blocks. Default: true.
+
+.. option:: buffer_compress_percentage=int
+
+	If this is set, then fio will attempt to provide I/O buffer content (on
+	WRITEs) that compresses to the specified level. Fio does this by providing a
+	mix of random data and a fixed pattern. The fixed pattern is either zeros,
+	or the pattern specified by :option:`buffer_pattern`. If the pattern option
+	is used, it might skew the compression ratio slightly. Note that this is per
+	block size unit, for file/disk wide compression level that matches this
+	setting, you'll also want to set :option:`refill_buffers`.
+
+.. option:: buffer_compress_chunk=int
+
+	See :option:`buffer_compress_percentage`. This setting allows fio to manage
+	how big the ranges of random data and zeroed data are. Without this set, fio
+	will provide :option:`buffer_compress_percentage` of blocksize random data,
+	followed by the remaining zeroed. With this set to some chunk size smaller
+	than the block size, fio can alternate random and zeroed data throughout the
+	I/O buffer.
+
+.. option:: buffer_pattern=str
+
+	If set, fio will fill the I/O buffers with this pattern or with the contents
+	of a file. If not set, the contents of I/O buffers are defined by the other
+	options related to buffer contents. The setting can be any pattern of bytes,
+	and can be prefixed with 0x for hex values. It may also be a string, where
+	the string must then be wrapped with ``""``. Or it may also be a filename,
+	where the filename must be wrapped with ``''`` in which case the file is
+	opened and read. Note that not all the file contents will be read if that
+	would cause the buffers to overflow. So, for example::
+
+		buffer_pattern='filename'
+
+	or::
+
+		buffer_pattern="abcd"
+
+	or::
+
+		buffer_pattern=-12
+
+	or::
+
+		buffer_pattern=0xdeadface
+
+	Also you can combine everything together in any order::
+
+		buffer_pattern=0xdeadface"abcd"-12'filename'
+
+.. option:: dedupe_percentage=int
+
+	If set, fio will generate this percentage of identical buffers when
+	writing. These buffers will be naturally dedupable. The contents of the
+	buffers depend on what other buffer compression settings have been set. It's
+	possible to have the individual buffers either fully compressible, or not at
+	all. This option only controls the distribution of unique buffers.
+
+.. option:: invalidate=bool
+
+	Invalidate the buffer/page cache parts of the files to be used prior to
+	starting I/O if the platform and file type support it.  Defaults to true.
+	This will be ignored if :option:`pre_read` is also specified for the
+	same job.
+
+.. option:: sync=bool
+
+	Use synchronous I/O for buffered writes. For the majority of I/O engines,
+	this means using O_SYNC. Default: false.
+
+.. option:: iomem=str, mem=str
+
+	Fio can use various types of memory as the I/O unit buffer.  The allowed
+	values are:
+
+		**malloc**
+			Use memory from :manpage:`malloc(3)` as the buffers.  Default memory
+			type.
+
+		**shm**
+			Use shared memory as the buffers. Allocated through
+			:manpage:`shmget(2)`.
+
+		**shmhuge**
+			Same as shm, but use huge pages as backing.
+
+		**mmap**
+			Use :manpage:`mmap(2)` to allocate buffers. May either be anonymous memory, or can
+			be file backed if a filename is given after the option. The format
+			is `mem=mmap:/path/to/file`.
+
+		**mmaphuge**
+			Use a memory mapped huge file as the buffer backing. Append filename
+			after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file`.
+
+		**mmapshared**
+			Same as mmap, but use a MAP_SHARED mapping.
+
+		**cudamalloc**
+			Use GPU memory as the buffers for GPUDirect RDMA benchmark.
+			The :option:`ioengine` must be `rdma`.
+
+	The area allocated is a function of the maximum allowed bs size for the job,
+	multiplied by the I/O depth given. Note that for **shmhuge** and
+	**mmaphuge** to work, the system must have free huge pages allocated. This
+	can normally be checked and set by reading/writing
+	:file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page
+	is 4MiB in size. So to calculate the number of huge pages you need for a
+	given job file, add up the I/O depth of all jobs (normally one unless
+	:option:`iodepth` is used) and multiply by the maximum bs set. Then divide
+	that number by the huge page size. You can see the size of the huge pages in
+	:file:`/proc/meminfo`. If no huge pages are allocated by having a non-zero
+	number in `nr_hugepages`, using **mmaphuge** or **shmhuge** will fail. Also
+	see :option:`hugepage-size`.
+
+	**mmaphuge** also needs to have hugetlbfs mounted and the file location
+	should point there. So if it's mounted in :file:`/huge`, you would use
+	`mem=mmaphuge:/huge/somefile`.
+
+.. option:: iomem_align=int, mem_align=int
+
+	This indicates the memory alignment of the I/O memory buffers.  Note that
+	the given alignment is applied to the first I/O unit buffer, if using
+	:option:`iodepth` the alignment of the following buffers is given by the
+	:option:`bs` used. In other words, if using a :option:`bs` that is a
+	multiple of the page size in the system, all buffers will be aligned to
+	this value. If using a :option:`bs` that is not page aligned, the alignment
+	of subsequent I/O memory buffers is the sum of the :option:`iomem_align` and
+	:option:`bs` used.
+
+.. option:: hugepage-size=int
+
+	Defines the size of a huge page. Must at least be equal to the system
+	setting, see :file:`/proc/meminfo`. Defaults to 4MiB.  Should probably
+	always be a multiple of megabytes, so using ``hugepage-size=Xm`` is the
+	preferred way to set this to avoid setting a non-pow-2 bad value.
+
+.. option:: lockmem=int
+
+	Pin the specified amount of memory with :manpage:`mlock(2)`. Can be used to
+	simulate a smaller amount of memory.  The amount specified is per worker.
+
+
+I/O size
+~~~~~~~~
+
+.. option:: size=int
+
+	The total size of file I/O for each thread of this job. Fio will run until
+	this many bytes have been transferred, unless runtime is limited by other options
+	(such as :option:`runtime`, for instance, or increased/decreased by :option:`io_size`).
+	Fio will divide this size between the available files determined by options
+	such as :option:`nrfiles`, :option:`filename`, unless :option:`filesize` is
+	specified by the job. If the result of division happens to be 0, the size is
+	set to the physical size of the given files or devices if they exist.
+	If this option is not specified, fio will use the full size of the given
+	files or devices.  If the files do not exist, size must be given. It is also
+	possible to give size as a percentage between 1 and 100. If ``size=20%`` is
+	given, fio will use 20% of the full size of the given files or devices.
+	Can be combined with :option:`offset` to constrain the start and end range
+	that I/O will be done within.
+
+.. option:: io_size=int, io_limit=int
+
+	Normally fio operates within the region set by :option:`size`, which means
+	that the :option:`size` option sets both the region and size of I/O to be
+	performed. Sometimes that is not what you want. With this option, it is
+	possible to define just the amount of I/O that fio should do. For instance,
+	if :option:`size` is set to 20GiB and :option:`io_size` is set to 5GiB, fio
+	will perform I/O within the first 20GiB but exit when 5GiB have been
+	done. The opposite is also possible -- if :option:`size` is set to 20GiB,
+	and :option:`io_size` is set to 40GiB, then fio will do 40GiB of I/O within
+	the 0..20GiB region.
+
+.. option:: filesize=irange(int)
+
+	Individual file sizes. May be a range, in which case fio will select sizes
+	for files at random within the given range and limited to :option:`size` in
+	total (if that is given). If not given, each created file is the same size.
+	This option overrides :option:`size` in terms of file size, which means
+	this value is used as a fixed size or possible range of each file.
+
+.. option:: file_append=bool
+
+	Perform I/O after the end of the file. Normally fio will operate within the
+	size of a file. If this option is set, then fio will append to the file
+	instead. This has identical behavior to setting :option:`offset` to the size
+	of a file.  This option is ignored on non-regular files.
+
+.. option:: fill_device=bool, fill_fs=bool
+
+	Sets size to something really large and waits for ENOSPC (no space left on
+	device) as the terminating condition. Only makes sense with sequential
+	write. For a read workload, the mount point will be filled first then I/O
+	started on the result. This option doesn't make sense if operating on a raw
+	device node, since the size of that is already known by the file system.
+	Additionally, writing beyond end-of-device will not return ENOSPC there.
+
+
+I/O engine
+~~~~~~~~~~
+
+.. option:: ioengine=str
+
+	Defines how the job issues I/O to the file. The following types are defined:
+
+		**sync**
+			Basic :manpage:`read(2)` or :manpage:`write(2)`
+			I/O. :manpage:`lseek(2)` is used to position the I/O location.
+			See :option:`fsync` and :option:`fdatasync` for syncing write I/Os.
+
+		**psync**
+			Basic :manpage:`pread(2)` or :manpage:`pwrite(2)` I/O.  Default on
+			all supported operating systems except for Windows.
+
+		**vsync**
+			Basic :manpage:`readv(2)` or :manpage:`writev(2)` I/O.  Will emulate
+			queuing by coalescing adjacent I/Os into a single submission.
+
+		**pvsync**
+			Basic :manpage:`preadv(2)` or :manpage:`pwritev(2)` I/O.
+
+		**pvsync2**
+			Basic :manpage:`preadv2(2)` or :manpage:`pwritev2(2)` I/O.
+
+		**libaio**
+			Linux native asynchronous I/O. Note that Linux may only support
+			queued behavior with non-buffered I/O (set ``direct=1`` or
+			``buffered=0``).
+			This engine defines engine specific options.
+
+		**posixaio**
+			POSIX asynchronous I/O using :manpage:`aio_read(3)` and
+			:manpage:`aio_write(3)`.
+
+		**solarisaio**
+			Solaris native asynchronous I/O.
+
+		**windowsaio**
+			Windows native asynchronous I/O.  Default on Windows.
+
+		**mmap**
+			File is memory mapped with :manpage:`mmap(2)` and data copied
+			to/from using :manpage:`memcpy(3)`.
+
+		**splice**
+			:manpage:`splice(2)` is used to transfer the data and
+			:manpage:`vmsplice(2)` to transfer data from user space to the
+			kernel.
+
+		**sg**
+			SCSI generic sg v3 I/O. May either be synchronous using the SG_IO
+			ioctl, or if the target is an sg character device we use
+			:manpage:`read(2)` and :manpage:`write(2)` for asynchronous
+			I/O. Requires :option:`filename` option to specify either block or
+			character devices.
+
+		**null**
+			Doesn't transfer any data, just pretends to.  This is mainly used to
+			exercise fio itself and for debugging/testing purposes.
+
+		**net**
+			Transfer over the network to given ``host:port``.  Depending on the
+			:option:`protocol` used, the :option:`hostname`, :option:`port`,
+			:option:`listen` and :option:`filename` options are used to specify
+			what sort of connection to make, while the :option:`protocol` option
+			determines which protocol will be used.  This engine defines engine
+			specific options.
+
+		**netsplice**
+			Like **net**, but uses :manpage:`splice(2)` and
+			:manpage:`vmsplice(2)` to map data and send/receive.
+			This engine defines engine specific options.
+
+		**cpuio**
+			Doesn't transfer any data, but burns CPU cycles according to the
+			:option:`cpuload` and :option:`cpuchunks` options. Setting
+			:option:`cpuload`\=85 will cause that job to do nothing but burn 85%
+			of the CPU. In case of SMP machines, use :option:`numjobs`\=<nr_of_cpu>
+			to get desired CPU usage, as the cpuload only loads a
+			single CPU at the desired rate. A job never finishes unless there is
+			at least one non-cpuio job.
+
+		**guasi**
+			The GUASI I/O engine is the Generic Userspace Asynchronous Syscall
+			Interface approach to async I/O. See
+
+			http://www.xmailserver.org/guasi-lib.html
+
+			for more info on GUASI.
+
+		**rdma**
+			The RDMA I/O engine supports both RDMA memory semantics
+			(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
+			InfiniBand, RoCE and iWARP protocols.
+
+		**falloc**
+			I/O engine that does regular fallocate to simulate data transfer as
+			fio ioengine.
+
+			DDIR_READ
+				does fallocate(,mode = FALLOC_FL_KEEP_SIZE,).
+
+			DDIR_WRITE
+				does fallocate(,mode = 0).
+
+			DDIR_TRIM
+				does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE).
+
+		**ftruncate**
+			I/O engine that sends :manpage:`ftruncate(2)` operations in response
+			to write (DDIR_WRITE) events. Each ftruncate issued sets the file's
+			size to the current block offset. :option:`blocksize` is ignored.
+
+		**e4defrag**
+			I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
+			defragment activity in response to DDIR_WRITE events.
+
+		**rbd**
+			I/O engine supporting direct access to Ceph Rados Block Devices
+			(RBD) via librbd without the need to use the kernel rbd driver. This
+			ioengine defines engine specific options.
+
+		**gfapi**
+			Using GlusterFS libgfapi sync interface for direct access to
+			GlusterFS volumes without having to go through FUSE.  This ioengine
+			defines engine specific options.
+
+		**gfapi_async**
+			Using GlusterFS libgfapi async interface for direct access to
+			GlusterFS volumes without having to go through FUSE. This ioengine
+			defines engine specific options.
+
+		**libhdfs**
+			Read and write through Hadoop (HDFS).  The :option:`filename` option
+			is used to specify host,port of the hdfs name-node to connect.  This
+			engine interprets offsets a little differently.  In HDFS, files once
+			created cannot be modified so random writes are not possible. To
+			imitate this the libhdfs engine expects a bunch of small files to be
+			created over HDFS and will randomly pick a file from them
+			based on the offset generated by fio backend (see the example
+			job file to create such files, use ``rw=write`` option). Please
+			note, it may be necessary to set environment variables to work
+			with HDFS/libhdfs properly.  Each job uses its own connection to
+			HDFS.
+
+		**mtd**
+			Read, write and erase an MTD character device (e.g.,
+			:file:`/dev/mtd0`). Discards are treated as erases. Depending on the
+			underlying device type, the I/O may have to go in a certain pattern,
+			e.g., on NAND, writing sequentially to erase blocks and discarding
+			before overwriting. The `trimwrite` mode works well for this
+			constraint.
+
+		**pmemblk**
+			Read and write using filesystem DAX to a file on a filesystem
+			mounted with DAX on a persistent memory device through the NVML
+			libpmemblk library.
+
+		**dev-dax**
+			Read and write using device DAX to a persistent memory device (e.g.,
+			/dev/dax0.0) through the NVML libpmem library.
+
+		**external**
+			Prefix to specify loading an external I/O engine object file. Append
+			the engine filename, e.g. ``ioengine=external:/tmp/foo.o`` to load
+			ioengine :file:`foo.o` in :file:`/tmp`. The path can be either
+			absolute or relative. See :file:`engines/skeleton_external.c` for
+			details of writing an external I/O engine.
+
+
+I/O engine specific parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition, there are some parameters which are only valid when a specific
+:option:`ioengine` is in use. These are used identically to normal parameters,
+with the caveat that when used on the command line, they must come after the
+:option:`ioengine` that defines them is selected.
+
+.. option:: userspace_reap : [libaio]
+
+	Normally, with the libaio engine in use, fio will use the
+	:manpage:`io_getevents(2)` system call to reap newly returned events.  With
+	this flag turned on, the AIO ring will be read directly from user-space to
+	reap events. The reaping mode is only enabled when polling for a minimum of
+	0 events (e.g. when :option:`iodepth_batch_complete`\=0).
+
+.. option:: hipri : [pvsync2]
+
+	Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
+	than normal.
+
+.. option:: hipri_percentage : [pvsync2]
+
+	When hipri is set this determines the probability of a pvsync2 I/O being high
+	priority. The default is 100%.
+
+.. option:: cpuload=int : [cpuio]
+
+	Attempt to use the specified percentage of CPU cycles. This is a mandatory
+	option when using cpuio I/O engine.
+
+.. option:: cpuchunks=int : [cpuio]
+
+	Split the load into cycles of the given time. In microseconds.
+
+.. option:: exit_on_io_done=bool : [cpuio]
+
+	Detect when I/O threads are done, then exit.
+
+.. option:: namenode=str : [libhdfs]
+
+	The hostname or IP address of a HDFS cluster namenode to contact.
+
+.. option:: port=int
+
+   [libhdfs]
+
+		The listening port of the HDFS cluster namenode.
+
+   [netsplice], [net]
+
+		The TCP or UDP port to bind to or connect to. If this is used with
+		:option:`numjobs` to spawn multiple instances of the same job type, then
+		this will be the starting port number since fio will use a range of
+		ports.
+
+.. option:: hostname=str : [netsplice] [net]
+
+	The hostname or IP address to use for TCP or UDP based I/O.  If the job is
+	a TCP listener or UDP reader, the hostname is not used and must be omitted
+	unless it is a valid UDP multicast address.
+
+.. option:: interface=str : [netsplice] [net]
+
+	The IP address of the network interface used to send or receive UDP
+	multicast.
+
+.. option:: ttl=int : [netsplice] [net]
+
+	Time-to-live value for outgoing UDP multicast packets. Default: 1.
+
+.. option:: nodelay=bool : [netsplice] [net]
+
+	Set TCP_NODELAY on TCP connections.
+
+.. option:: protocol=str, proto=str : [netsplice] [net]
+
+	The network protocol to use. Accepted values are:
+
+	**tcp**
+		Transmission control protocol.
+	**tcpv6**
+		Transmission control protocol V6.
+	**udp**
+		User datagram protocol.
+	**udpv6**
+		User datagram protocol V6.
+	**unix**
+		UNIX domain socket.
+
+	When the protocol is TCP or UDP, the port must also be given, as well as the
+	hostname if the job is a TCP listener or UDP reader. For unix sockets, the
+	normal :option:`filename` option should be used and the port is invalid.
+
+.. option:: listen : [netsplice] [net]
+
+	For TCP network connections, tell fio to listen for incoming connections
+	rather than initiating an outgoing connection. The :option:`hostname` must
+	be omitted if this option is used.
+
+.. option:: pingpong : [netsplice] [net]
+
+	Normally a network writer will just continue writing data, and a network
+	reader will just consume packets. If ``pingpong=1`` is set, a writer will
+	send its normal payload to the reader, then wait for the reader to send the
+	same payload back. This allows fio to measure network latencies. The
+	submission and completion latencies then measure local time spent sending or
+	receiving, and the completion latency measures how long it took for the
+	other end to receive and send back.  For UDP multicast traffic
+	``pingpong=1`` should only be set for a single reader when multiple readers
+	are listening to the same address.
+
+.. option:: window_size : [netsplice] [net]
+
+	Set the desired socket buffer size for the connection.
+
+.. option:: mss : [netsplice] [net]
+
+	Set the TCP maximum segment size (TCP_MAXSEG).
+
+.. option:: donorname=str : [e4defrag]
+
+	File will be used as a block donor (swap extents between files).
+
+.. option:: inplace=int : [e4defrag]
+
+	Configure donor file blocks allocation strategy:
+
+	**0**
+		Default. Preallocate donor's file on init.
+	**1**
+		Allocate space immediately inside defragment event, and free right
+		after event.
+
+.. option:: clustername=str : [rbd]
+
+	Specifies the name of the Ceph cluster.
+
+.. option:: rbdname=str : [rbd]
+
+	Specifies the name of the RBD.
+
+.. option:: pool=str : [rbd]
+
+	Specifies the name of the Ceph pool containing RBD.
+
+.. option:: clientname=str : [rbd]
+
+	Specifies the username (without the 'client.' prefix) used to access the
+	Ceph cluster. If the *clustername* is specified, the *clientname* shall be
+	the full *type.id* string. If no type. prefix is given, fio will add
+	'client.' by default.
+
+.. option:: skip_bad=bool : [mtd]
+
+	Skip operations against known bad blocks.
+
+.. option:: hdfsdirectory : [libhdfs]
+
+	libhdfs will create chunks in this HDFS directory.
+
+.. option:: chunk_size : [libhdfs]
+
+	The size of the chunk to use for each file.
+
+
+I/O depth
+~~~~~~~~~
+
+.. option:: iodepth=int
+
+	Number of I/O units to keep in flight against the file.  Note that
+	increasing *iodepth* beyond 1 will not affect synchronous ioengines (except
+	for small degrees when :option:`verify_async` is in use).  Even async
+	engines may impose OS restrictions causing the desired depth not to be
+	achieved.  This may happen on Linux when using libaio and not setting
+	:option:`direct`\=1, since buffered I/O is not async on that OS.  Keep an
+	eye on the I/O depth distribution in the fio output to verify that the
+	achieved depth is as expected. Default: 1.
+
+.. option:: iodepth_batch_submit=int, iodepth_batch=int
+
+	This defines how many pieces of I/O to submit at once.  It defaults to 1
+	which means that we submit each I/O as soon as it is available, but can be
+	raised to submit bigger batches of I/O at a time. If it is set to 0 the
+	:option:`iodepth` value will be used.
+
+.. option:: iodepth_batch_complete_min=int, iodepth_batch_complete=int
+
+	This defines how many pieces of I/O to retrieve at once. It defaults to 1
+	which means that we'll ask for a minimum of 1 I/O in the retrieval process
+	from the kernel. The I/O retrieval will go on until we hit the limit set by
+	:option:`iodepth_low`. If this variable is set to 0, then fio will always
+	check for completed events before queuing more I/O. This helps reduce I/O
+	latency, at the cost of more retrieval system calls.
+
+.. option:: iodepth_batch_complete_max=int
+
+	This defines the maximum number of pieces of I/O to retrieve at once. This variable should
+	be used along with :option:`iodepth_batch_complete_min`\=int variable,
+	specifying the range of min and max amount of I/O which should be
+	retrieved. By default it is equal to the :option:`iodepth_batch_complete_min`
+	value.
+
+	Example #1::
+
+		iodepth_batch_complete_min=1
+		iodepth_batch_complete_max=<iodepth>
+
+	which means that we will retrieve at least 1 I/O and up to the whole
+	submitted queue depth. If no I/O has been completed yet, we will wait.
+
+	Example #2::
+
+		iodepth_batch_complete_min=0
+		iodepth_batch_complete_max=<iodepth>
+
+	which means that we can retrieve up to the whole submitted queue depth, but
+	if no I/O has been completed yet, we will NOT wait and immediately exit
+	the system call. In this example we simply do polling.
+
+.. option:: iodepth_low=int
+
+	The low water mark indicating when to start filling the queue
+	again. Defaults to the same as :option:`iodepth`, meaning that fio will
+	attempt to keep the queue full at all times.  If :option:`iodepth` is set to
+	e.g. 16 and *iodepth_low* is set to 4, then after fio has filled the queue of
+	16 requests, it will let the depth drain down to 4 before starting to fill
+	it again.
+
+.. option:: serialize_overlap=bool
+
+	Serialize in-flight I/Os that might otherwise cause or suffer from data races.
+	When two or more I/Os are submitted simultaneously, there is no guarantee that
+	the I/Os will be processed or completed in the submitted order. Further, if
+	two or more of those I/Os are writes, any overlapping region between them can
+	become indeterminate/undefined on certain storage. These issues can cause
+	verification to fail erratically when at least one of the racing I/Os is
+	changing data and the overlapping region has a non-zero size. Setting
+	``serialize_overlap`` tells fio to avoid provoking this behavior by explicitly
+	serializing in-flight I/Os that have a non-zero overlap. Note that setting
+	this option can reduce both performance and the :option:`iodepth` achieved.
+	Additionally this option does not work when :option:`io_submit_mode` is set to
+	offload. Default: false.
+
+.. option:: io_submit_mode=str
+
+	This option controls how fio submits the I/O to the I/O engine. The default
+	is `inline`, which means that the fio job threads submit and reap I/O
+	directly. If set to `offload`, the job threads will offload I/O submission
+	to a dedicated pool of I/O threads. This requires some coordination and thus
+	has a bit of extra overhead, especially for lower queue depth I/O where it
+	can increase latencies. The benefit is that fio can manage submission rates
+	independently of the device completion rates. This avoids skewed latency
+	reporting if I/O gets backed up on the device side (the coordinated omission
+	problem).
+
+
+I/O rate
+~~~~~~~~
+
+.. option:: thinktime=time
+
+	Stall the job for the specified period of time after an I/O has completed before issuing the
+	next. May be used to simulate processing being done by an application.
+	When the unit is omitted, the value is interpreted in microseconds.  See
+	:option:`thinktime_blocks` and :option:`thinktime_spin`.
+
+.. option:: thinktime_spin=time
+
+	Only valid if :option:`thinktime` is set - pretend to spend CPU time doing
+	something with the data received, before falling back to sleeping for the
+	rest of the period specified by :option:`thinktime`.  When the unit is
+	omitted, the value is interpreted in microseconds.
+
+.. option:: thinktime_blocks=int
+
+	Only valid if :option:`thinktime` is set - control how many blocks to issue,
+	before waiting :option:`thinktime` usecs. If not set, defaults to 1 which will make
+	fio wait :option:`thinktime` usecs after every block. This effectively makes any
+	queue depth setting redundant, since no more than 1 I/O will be queued
+	before we have to complete it and do our :option:`thinktime`. In other words, this
+	setting effectively caps the queue depth if the latter is larger.
+
+.. option:: rate=int[,int][,int]
+
+	Cap the bandwidth used by this job. The number is in bytes/sec, the normal
+	suffix rules apply.  Comma-separated values may be specified for reads,
+	writes, and trims as described in :option:`blocksize`.
+
+	For example, using `rate=1m,500k` would limit reads to 1MiB/sec and writes to
+	500KiB/sec.  Capping only reads or writes can be done with `rate=,500k` or
+	`rate=500k,` where the former will only limit writes (to 500KiB/sec) and the
+	latter will only limit reads.
+
+.. option:: rate_min=int[,int][,int]
+
+	Tell fio to do whatever it can to maintain at least this bandwidth. Failing
+	to meet this requirement will cause the job to exit.  Comma-separated values
+	may be specified for reads, writes, and trims as described in
+	:option:`blocksize`.
+
+.. option:: rate_iops=int[,int][,int]
+
+	Cap the bandwidth to this number of IOPS. Basically the same as
+	:option:`rate`, just specified independently of bandwidth. If the job is
+	given a block size range instead of a fixed value, the smallest block size
+	is used as the metric.  Comma-separated values may be specified for reads,
+	writes, and trims as described in :option:`blocksize`.
+
+.. option:: rate_iops_min=int[,int][,int]
+
+	If fio doesn't meet this rate of I/O, it will cause the job to exit.
+	Comma-separated values may be specified for reads, writes, and trims as
+	described in :option:`blocksize`.
+
+.. option:: rate_process=str
+
+	This option controls how fio manages rated I/O submissions. The default is
+	`linear`, which submits I/O in a linear fashion with fixed delays between
+	I/Os that gets adjusted based on I/O completion rates. If this is set to
+	`poisson`, fio will submit I/O based on a more real world random request
+	flow, known as the Poisson process
+	(https://en.wikipedia.org/wiki/Poisson_point_process). The lambda will be
+	10^6 / IOPS for the given workload.
+
+
+I/O latency
+~~~~~~~~~~~
+
+.. option:: latency_target=time
+
+	If set, fio will attempt to find the max performance point that the given
+	workload will run at while maintaining a latency below this target.  When
+	the unit is omitted, the value is interpreted in microseconds.  See
+	:option:`latency_window` and :option:`latency_percentile`.
+
+.. option:: latency_window=time
+
+	Used with :option:`latency_target` to specify the sample window that the job
+	is run at varying queue depths to test the performance.  When the unit is
+	omitted, the value is interpreted in microseconds.
+
+.. option:: latency_percentile=float
+
+	The percentage of I/Os that must fall within the criteria specified by
+	:option:`latency_target` and :option:`latency_window`. If not set, this
+	defaults to 100.0, meaning that all I/Os must be equal to or below the value
+	set by :option:`latency_target`.
+
+.. option:: max_latency=time
+
+	If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
+	maximum latency. When the unit is omitted, the value is interpreted in
+	microseconds.
+
+.. option:: rate_cycle=int
+
+	Average bandwidth for :option:`rate` and :option:`rate_min` over this number
+	of milliseconds. Defaults to 1000.
+
+
+I/O replay
+~~~~~~~~~~
+
+.. option:: write_iolog=str
+
+	Write the issued I/O patterns to the specified file. See
+	:option:`read_iolog`.  Specify a separate file for each job, otherwise the
+	iologs will be interspersed and the file may be corrupt.
+
+.. option:: read_iolog=str
+
+	Open an iolog with the specified filename and replay the I/O patterns it
+	contains. This can be used to store a workload and replay it sometime
+	later. The iolog given may also be a blktrace binary file, which allows fio
+	to replay a workload captured by :command:`blktrace`. See
+	:manpage:`blktrace(8)` for how to capture such logging data. For blktrace
+	replay, the file needs to be turned into a blkparse binary data file first
+	(``blkparse <device> -o /dev/null -d file_for_fio.bin``).
+
+.. option:: replay_no_stall=bool
+
+	When replaying I/O with :option:`read_iolog` the default behavior is to
+	attempt to respect the timestamps within the log and replay them with the
+	appropriate delay between IOPS. By setting this variable fio will not
+	respect the timestamps and attempt to replay them as fast as possible while
+	still respecting ordering. The result is the same I/O pattern to a given
+	device, but different timings.
+
+.. option:: replay_redirect=str
+
+	While replaying I/O patterns using :option:`read_iolog` the default behavior
+	is to replay the IOPS onto the major/minor device that each IOP was recorded
+	from.  This is sometimes undesirable because on a different machine those
+	major/minor numbers can map to a different device.  Changing hardware on the
+	same system can also result in a different major/minor mapping.
+	``replay_redirect`` causes all I/Os to be replayed onto the single specified
+	device regardless of the device it was recorded
+	from. i.e. :option:`replay_redirect`\= :file:`/dev/sdc` would cause all I/O
+	in the blktrace or iolog to be replayed onto :file:`/dev/sdc`.  This means
+	multiple devices will be replayed onto a single device, if the trace
+	contains multiple devices. If you want multiple devices to be replayed
+	concurrently to multiple redirected devices you must blkparse your trace
+	into separate traces and replay them with independent fio invocations.
+	Unfortunately this also breaks the strict time ordering between multiple
+	device accesses.
+
+.. option:: replay_align=int
+
+	Force alignment of I/O offsets and lengths in a trace to this power of 2
+	value.
+
+.. option:: replay_scale=int
+
+	Scale sector offsets down by this factor when replaying traces.
+
+
+Threads, processes and job synchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: thread
+
+	Fio defaults to creating jobs by using fork, however if this option is
+	given, fio will create jobs by using POSIX Threads' function
+	:manpage:`pthread_create(3)` to create threads instead.
+
+.. option:: wait_for=str
+
+	If set, the current job won't be started until all workers of the specified
+	waitee job are done.
+
+	``wait_for`` operates on the job name basis, so there are a few
+	limitations. First, the waitee must be defined prior to the waiter job
+	(meaning no forward references). Second, if a job is being referenced as a
+	waitee, it must have a unique name (no duplicate waitees).
+
+.. option:: nice=int
+
+	Run the job with the given nice value. See man :manpage:`nice(2)`.
+
+	On Windows, values less than -15 set the process class to "High"; -1 through
+	-15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle"
+	priority class.
+
+.. option:: prio=int
+
+	Set the I/O priority value of this job. Linux limits us to a positive value
+	between 0 and 7, with 0 being the highest.  See man
+	:manpage:`ionice(1)`. Refer to an appropriate manpage for other operating
+	systems since meaning of priority may differ.
+
+.. option:: prioclass=int
+
+	Set the I/O priority class. See man :manpage:`ionice(1)`.
+
+.. option:: cpumask=int
+
+	Set the CPU affinity of this job. The parameter given is a bit mask of
+	allowed CPUs the job may run on. So if you want the allowed CPUs to be 1
+	and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
+	:manpage:`sched_setaffinity(2)`. This may not work on all supported
+	operating systems or kernel versions. This option doesn't work well for a
+	higher CPU count than what you can store in an integer mask, so it can only
+	control cpus 1-32. For boxes with larger CPU counts, use
+	:option:`cpus_allowed`.
+
+.. option:: cpus_allowed=str
+
+	Controls the same options as :option:`cpumask`, but accepts a textual
+	specification of the permitted CPUs instead. So to use CPUs 1 and 5 you
+	would specify ``cpus_allowed=1,5``. This option also allows a range of CPUs
+	to be specified -- say you wanted a binding to CPUs 1, 5, and 8 to 15, you
+	would set ``cpus_allowed=1,5,8-15``.
+
+.. option:: cpus_allowed_policy=str
+
+	Set the policy of how fio distributes the CPUs specified by
+	:option:`cpus_allowed` or :option:`cpumask`. Two policies are supported:
+
+		**shared**
+			All jobs will share the CPU set specified.
+		**split**
+			Each job will get a unique CPU from the CPU set.
+
+	**shared** is the default behavior, if the option isn't specified. If
+	**split** is specified, then fio will assign one cpu per job. If not
+	enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs
+	in the set.
+
+.. option:: numa_cpu_nodes=str
+
+	Set this job running on specified NUMA nodes' CPUs. The arguments allow
+	comma delimited list of cpu numbers, A-B ranges, or `all`. Note, to enable
+	NUMA options support, fio must be built on a system with libnuma-dev(el)
+	installed.
+
+.. option:: numa_mem_policy=str
+
+	Set this job's memory policy and corresponding NUMA nodes. Format of the
+	arguments::
+
+		<mode>[:<nodelist>]
+
+	``mode`` is one of the following memory policies: ``default``, ``prefer``,
+	``bind``, ``interleave`` or ``local``. For ``default`` and ``local`` memory
+	policies, no node needs to be specified.  For ``prefer``, only one node is
+	allowed.  For ``bind`` and ``interleave`` the ``nodelist`` may be as
+	follows: a comma delimited list of numbers, A-B ranges, or `all`.
+
+.. option:: cgroup=str
+
+	Add job to this control group. If it doesn't exist, it will be created. The
+	system must have a mounted cgroup blkio mount point for this to work. If
+	your system doesn't have it mounted, you can do so with::
+
+		# mount -t cgroup -o blkio none /cgroup
+
+.. option:: cgroup_weight=int
+
+	Set the weight of the cgroup to this value. See the documentation that comes
+	with the kernel, allowed values are in the range of 100..1000.
+
+.. option:: cgroup_nodelete=bool
+
+	Normally fio will delete the cgroups it has created after the job
+	completion. To override this behavior and to leave cgroups around after the
+	job completion, set ``cgroup_nodelete=1``.  This can be useful if one wants
+	to inspect various cgroup files after job completion. Default: false.
+
+.. option:: flow_id=int
+
+	The ID of the flow. If not specified, it defaults to being a global
+	flow. See :option:`flow`.
+
+.. option:: flow=int
+
+	Weight in token-based flow control. If this value is used, then there is a
+	'flow counter' which is used to regulate the proportion of activity between
+	two or more jobs. Fio attempts to keep this flow counter near zero. The
+	``flow`` parameter stands for how much should be added or subtracted to the
+	flow counter on each iteration of the main I/O loop. That is, if one job has
+	``flow=8`` and another job has ``flow=-1``, then there will be a roughly 1:8
+	ratio in how much one runs vs the other.
+
+.. option:: flow_watermark=int
+
+	The maximum value that the absolute value of the flow counter is allowed to
+	reach before the job must wait for a lower value of the counter.
+
+.. option:: flow_sleep=int
+
+	The period of time, in microseconds, to wait after the flow watermark has
+	been exceeded before retrying operations.
+
+.. option:: stonewall, wait_for_previous
+
+	Wait for preceding jobs in the job file to exit, before starting this
+	one. Can be used to insert serialization points in the job file. A stone
+	wall also implies starting a new reporting group, see
+	:option:`group_reporting`.
+
+.. option:: exitall
+
+	By default, fio will continue running all other jobs when one job finishes
+	but sometimes this is not the desired action.  Setting ``exitall`` will
+	instead make fio terminate all other jobs when one job finishes.
+
+.. option:: exec_prerun=str
+
+	Before running this job, issue the command specified through
+	:manpage:`system(3)`. Output is redirected to a file called
+	:file:`jobname.prerun.txt`.
+
+.. option:: exec_postrun=str
+
+	After the job completes, issue the command specified through
+	:manpage:`system(3)`. Output is redirected to a file called
+	:file:`jobname.postrun.txt`.
+
+.. option:: uid=int
+
+	Instead of running as the invoking user, set the user ID to this value
+	before the thread/process does any work.
+
+.. option:: gid=int
+
+	Set group ID, see :option:`uid`.
+
+
+Verification
+~~~~~~~~~~~~
+
+.. option:: verify_only
+
+	Do not perform specified workload, only verify data still matches previous
+	invocation of this workload. This option allows one to check data multiple
+	times at a later date without overwriting it. This option makes sense only
+	for workloads that write data, and does not support workloads with the
+	:option:`time_based` option set.
+
+.. option:: do_verify=bool
+
+	Run the verify phase after a write phase. Only valid if :option:`verify` is
+	set. Default: true.
+
+.. option:: verify=str
+
+	If writing to a file, fio can verify the file contents after each iteration
+	of the job. Each verification method also implies verification of special
+	header, which is written to the beginning of each block. This header also
+	includes meta information, like offset of the block, block number, timestamp
+	when block was written, etc.  :option:`verify` can be combined with
+	:option:`verify_pattern` option.  The allowed values are:
+
+		**md5**
+			Use an md5 sum of the data area and store it in the header of
+			each block.
+
+		**crc64**
+			Use an experimental crc64 sum of the data area and store it in the
+			header of each block.
+
+		**crc32c**
+			Use a crc32c sum of the data area and store it in the header of
+			each block. This will automatically use hardware acceleration
+			(e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will
+			fall back to software crc32c if none is found. Generally the
+			fastest checksum fio supports when hardware accelerated.
+
+		**crc32c-intel**
+			Synonym for crc32c.
+
+		**crc32**
+			Use a crc32 sum of the data area and store it in the header of each
+			block.
+
+		**crc16**
+			Use a crc16 sum of the data area and store it in the header of each
+			block.
+
+		**crc7**
+			Use a crc7 sum of the data area and store it in the header of each
+			block.
+
+		**xxhash**
+			Use xxhash as the checksum function. Generally the fastest software
+			checksum that fio supports.
+
+		**sha512**
+			Use sha512 as the checksum function.
 
-1. Overview
-2. How fio works
-3. Running fio
-4. Job file format
-5. Detailed list of parameters
-6. Normal output
-7. Terse output
-8. Trace file format
-9. CPU idleness profiling
-10. Verification and triggers
-11. Log File Formats
-
-
-1.0 Overview and history
-------------------------
-fio was originally written to save me the hassle of writing special test
-case programs when I wanted to test a specific workload, either for
-performance reasons or to find/reproduce a bug. The process of writing
-such a test app can be tiresome, especially if you have to do it often.
-Hence I needed a tool that would be able to simulate a given io workload
-without resorting to writing a tailored test case again and again.
-
-A test work load is difficult to define, though. There can be any number
-of processes or threads involved, and they can each be using their own
-way of generating io. You could have someone dirtying large amounts of
-memory in an memory mapped file, or maybe several threads issuing
-reads using asynchronous io. fio needed to be flexible enough to
-simulate both of these cases, and many more.
+		**sha256**
+			Use sha256 as the checksum function.
 
-2.0 How fio works
------------------
-The first step in getting fio to simulate a desired io workload, is
-writing a job file describing that specific setup. A job file may contain
-any number of threads and/or files - the typical contents of the job file
-is a global section defining shared parameters, and one or more job
-sections describing the jobs involved. When run, fio parses this file
-and sets everything up as described. If we break down a job from top to
-bottom, it contains the following basic parameters:
+		**sha1**
+			Use optimized sha1 as the checksum function.
 
-	IO type		Defines the io pattern issued to the file(s).
-			We may only be reading sequentially from this
-			file(s), or we may be writing randomly. Or even
-			mixing reads and writes, sequentially or randomly.
+		**sha3-224**
+			Use optimized sha3-224 as the checksum function.
 
-	Block size	In how large chunks are we issuing io? This may be
-			a single value, or it may describe a range of
-			block sizes.
+		**sha3-256**
+			Use optimized sha3-256 as the checksum function.
 
-	IO size		How much data are we going to be reading/writing.
+		**sha3-384**
+			Use optimized sha3-384 as the checksum function.
 
-	IO engine	How do we issue io? We could be memory mapping the
-			file, we could be using regular read/write, we
-			could be using splice, async io, or even SG
-			(SCSI generic sg).
+		**sha3-512**
+			Use optimized sha3-512 as the checksum function.
 
-	IO depth	If the io engine is async, how large a queuing
-			depth do we want to maintain?
+		**meta**
+			This option is deprecated, since now meta information is included in
+			generic verification header and meta verification happens by
+			default. For detailed information see the description of the
+			:option:`verify` setting. This option is kept only for
+			compatibility with old configurations. Do not use it.
 
-	IO type		Should we be doing buffered io, or direct/raw io?
+		**pattern**
+			Verify a strict pattern. Normally fio includes a header with some
+			basic information and checksumming, but if this option is set, only
+			the specific pattern set with :option:`verify_pattern` is verified.
 
-	Num files	How many files are we spreading the workload over.
+		**null**
+			Only pretend to verify. Useful for testing internals with
+			:option:`ioengine`\=null, not for much else.
 
-	Num threads	How many threads or processes should we spread
-			this workload over.
+	This option can be used for repeated burn-in tests of a system to make sure
+	that the written data is also correctly read back. If the data direction
+	given is a read or random read, fio will assume that it should verify a
+	previously written file. If the data direction includes any form of write,
+	the verify will be of the newly written data.
 
-The above are the basic parameters defined for a workload, in addition
-there's a multitude of parameters that modify other aspects of how this
-job behaves.
+.. option:: verifysort=bool
 
+	If true, fio will sort written verify blocks when it deems it faster to read
+	them back in a sorted manner. This is often the case when overwriting an
+	existing file, since the blocks are already laid out in the file system. You
+	can ignore this option unless doing huge amounts of really fast I/O where
+	the red-black tree sorting CPU time becomes significant. Default: true.
 
-3.0 Running fio
----------------
-See the README file for command line parameters, there are only a few
-of them.
+.. option:: verifysort_nr=int
 
-Running fio is normally the easiest part - you just give it the job file
-(or job files) as parameters:
+	Pre-load and sort verify blocks for a read workload.
 
-$ fio job_file
+.. option:: verify_offset=int
 
-and it will start doing what the job_file tells it to do. You can give
-more than one job file on the command line, fio will serialize the running
-of those files. Internally that is the same as using the 'stonewall'
-parameter described in the parameter section.
-
-If the job file contains only one job, you may as well just give the
-parameters on the command line. The command line parameters are identical
-to the job parameters, with a few extra that control global parameters
-(see README). For example, for the job file parameter iodepth=2, the
-mirror command line option would be --iodepth 2 or --iodepth=2. You can
-also use the command line for giving more than one job entry. For each
---name option that fio sees, it will start a new job with that name.
-Command line entries following a --name entry will apply to that job,
-until there are no more entries or a new --name entry is seen. This is
-similar to the job file options, where each option applies to the current
-job until a new [] job entry is seen.
-
-fio does not need to run as root, except if the files or devices specified
-in the job section requires that. Some other options may also be restricted,
-such as memory locking, io scheduler switching, and decreasing the nice value.
+	Swap the verification header with data somewhere else in the block before
+	writing. It is swapped back before verifying.
 
+.. option:: verify_interval=int
 
-4.0 Job file format
--------------------
-As previously described, fio accepts one or more job files describing
-what it is supposed to do. The job file format is the classic ini file,
-where the names enclosed in [] brackets define the job name. You are free
-to use any ascii name you want, except 'global' which has special meaning.
-A global section sets defaults for the jobs described in that file. A job
-may override a global section parameter, and a job file may even have
-several global sections if so desired. A job is only affected by a global
-section residing above it. If the first character in a line is a ';' or a
-'#', the entire line is discarded as a comment.
+	Write the verification header at a finer granularity than the
+	:option:`blocksize`. It will be written for chunks the size of
+	``verify_interval``. :option:`blocksize` should divide this evenly.
 
-So let's look at a really simple job file that defines two processes, each
-randomly reading from a 128MB file.
+.. option:: verify_pattern=str
 
-; -- start job file --
-[global]
-rw=randread
-size=128m
+	If set, fio will fill the I/O buffers with this pattern. Fio defaults to
+	filling with totally random bytes, but sometimes it's interesting to fill
+	with a known pattern for I/O verification purposes. Depending on the width
+	of the pattern, fio will fill 1/2/3/4 bytes of the buffer at a time (it can
+	be either a decimal or a hex number).  The ``verify_pattern`` if larger than
+	a 32-bit quantity has to be a hex number that starts with either "0x" or
+	"0X". Use with :option:`verify`. Also, ``verify_pattern`` supports %o
+	format, which means that for each block the offset will be written and then
+	verified back, e.g.::
 
-[job1]
+		verify_pattern=%o
 
-[job2]
+	Or use combination of everything::
 
-; -- end job file --
+		verify_pattern=0xff%o"abcd"-12
 
-As you can see, the job file sections themselves are empty as all the
-described parameters are shared. As no filename= option is given, fio
-makes up a filename for each of the jobs as it sees fit. On the command
-line, this job would look as follows:
+.. option:: verify_fatal=bool
 
-$ fio --name=global --rw=randread --size=128m --name=job1 --name=job2
+	Normally fio will keep checking the entire contents before quitting on a
+	block verification failure. If this option is set, fio will exit the job on
+	the first observed failure. Default: false.
 
+.. option:: verify_dump=bool
 
-Let's look at an example that has a number of processes writing randomly
-to files.
+	If set, dump the contents of both the original data block and the data block
+	we read off disk to files. This allows later analysis to inspect just what
+	kind of data corruption occurred. Off by default.
 
-; -- start job file --
-[random-writers]
-ioengine=libaio
-iodepth=4
-rw=randwrite
-bs=32k
-direct=0
-size=64m
-numjobs=4
-
-; -- end job file --
-
-Here we have no global section, as we only have one job defined anyway.
-We want to use async io here, with a depth of 4 for each file. We also
-increased the buffer size used to 32KB and define numjobs to 4 to
-fork 4 identical jobs. The result is 4 processes each randomly writing
-to their own 64MB file. Instead of using the above job file, you could
-have given the parameters on the command line. For this case, you would
-specify:
+.. option:: verify_async=int
 
-$ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4
+	Fio will normally verify I/O inline from the submitting thread. This option
+	takes an integer describing how many async offload threads to create for I/O
+	verification instead, causing fio to offload the duty of verifying I/O
+	contents to one or more separate threads. If using this offload option, even
+	sync I/O engines can benefit from using an :option:`iodepth` setting higher
+	than 1, as it allows them to have I/O in flight while verifies are running.
+	Defaults to 0 async threads, i.e. verification is not asynchronous.
 
-When fio is utilized as a basis of any reasonably large test suite, it might be
-desirable to share a set of standardized settings across multiple job files.
-Instead of copy/pasting such settings, any section may pull in an external
-.fio file with 'include filename' directive, as in the following example:
+.. option:: verify_async_cpus=str
 
-; -- start job file including.fio --
-[global]
-filename=/tmp/test
-filesize=1m
-include glob-include.fio
-
-[test]
-rw=randread
-bs=4k
-time_based=1
-runtime=10
-include test-include.fio
-; -- end job file including.fio --
-
-; -- start job file glob-include.fio --
-thread=1
-group_reporting=1
-; -- end job file glob-include.fio --
-
-; -- start job file test-include.fio --
-ioengine=libaio
-iodepth=4
-; -- end job file test-include.fio --
-
-Settings pulled into a section apply to that section only (except global
-section). Include directives may be nested in that any included file may
-contain further include directive(s). Include files may not contain []
-sections.
+	Tell fio to set the given CPU affinity on the async I/O verification
+	threads. See :option:`cpus_allowed` for the format used.
 
+.. option:: verify_backlog=int
 
-4.1 Environment variables
--------------------------
+	Fio will normally verify the written contents of a job that utilizes verify
+	once that job has completed. In other words, everything is written then
+	everything is read back and verified. You may want to verify continually
+	instead for a variety of reasons. Fio stores the meta data associated with
+	an I/O block in memory, so for large verify workloads, quite a bit of memory
+	would be used up holding this meta data. If this option is enabled, fio will
+	write only N blocks before verifying these blocks.
 
-fio also supports environment variable expansion in job files. Any
-sub-string of the form "${VARNAME}" as part of an option value (in other
-words, on the right of the `='), will be expanded to the value of the
-environment variable called VARNAME.  If no such environment variable
-is defined, or VARNAME is the empty string, the empty string will be
-substituted.
+.. option:: verify_backlog_batch=int
 
-As an example, let's look at a sample fio invocation and job file:
+	Control how many blocks fio will verify if :option:`verify_backlog` is
+	set. If not set, will default to the value of :option:`verify_backlog`
+	(meaning the entire queue is read back and verified).  If
+	``verify_backlog_batch`` is less than :option:`verify_backlog` then not all
+	blocks will be verified, if ``verify_backlog_batch`` is larger than
+	:option:`verify_backlog`, some blocks will be verified more than once.
 
-$ SIZE=64m NUMJOBS=4 fio jobfile.fio
+.. option:: verify_state_save=bool
 
-; -- start job file --
-[random-writers]
-rw=randwrite
-size=${SIZE}
-numjobs=${NUMJOBS}
-; -- end job file --
+	When a job exits during the write phase of a verify workload, save its
+	current state. This allows fio to replay up until that point, if the verify
+	state is loaded for the verify read phase. The format of the filename is,
+	roughly::
 
-This will expand to the following equivalent job file at runtime:
+		<type>-<jobname>-<jobindex>-verify.state.
 
-; -- start job file --
-[random-writers]
-rw=randwrite
-size=64m
-numjobs=4
-; -- end job file --
+	<type> is "local" for a local run, "sock" for a client/server socket
+	connection, and "ip" (192.168.0.1, for instance) for a networked
+	client/server connection. Defaults to true.
 
-fio ships with a few example job files, you can also look there for
-inspiration.
+.. option:: verify_state_load=bool
 
-4.2 Reserved keywords
----------------------
+	If a verify termination trigger was used, fio stores the current write state
+	of each thread. This can be used at verification time so that fio knows how
+	far it should verify.  Without this information, fio will run a full
+	verification pass, according to the settings in the job file used.  Default:
+	false.
 
-Additionally, fio has a set of reserved keywords that will be replaced
-internally with the appropriate value. Those keywords are:
+.. option:: trim_percentage=int
 
-$pagesize	The architecture page size of the running system
-$mb_memory	Megabytes of total memory in the system
-$ncpus		Number of online available CPUs
+	Number of verify blocks to discard/trim.
 
-These can be used on the command line or in the job file, and will be
-automatically substituted with the current system values when the job
-is run. Simple math is also supported on these keywords, so you can
-perform actions like:
-
-size=8*$mb_memory
-
-and get that properly expanded to 8 times the size of memory in the
-machine.
-
-
-5.0 Detailed list of parameters
--------------------------------
-
-This section describes in details each parameter associated with a job.
-Some parameters take an option of a given type, such as an integer or
-a string. Anywhere a numeric value is required, an arithmetic expression
-may be used, provided it is surrounded by parentheses. Supported operators
-are:
-
-	addition (+)
-	subtraction (-)
-	multiplication (*)
-	division (/)
-	modulus (%)
-	exponentiation (^)
+.. option:: trim_verify_zero=bool
 
-For time values in expressions, units are microseconds by default. This is
-different than for time values not in expressions (not enclosed in
-parentheses). The following types are used:
+	Verify that trim/discarded blocks are returned as zeros.
 
-str	String. This is a sequence of alpha characters.
-time	Integer with possible time suffix. In seconds unless otherwise
-	specified, use eg 10m for 10 minutes. Accepts s/m/h for seconds,
-	minutes, and hours, and accepts 'ms' (or 'msec') for milliseconds,
-	and 'us' (or 'usec') for microseconds.
-int	SI integer. A whole number value, which may contain a suffix
-	describing the base of the number. Accepted suffixes are k/m/g/t/p,
-	meaning kilo, mega, giga, tera, and peta. The suffix is not case
-	sensitive, and you may also include trailing 'b' (eg 'kb' is the same
-	as 'k'). So if you want to specify 4096, you could either write
-	out '4096' or just give 4k. The suffixes signify base 2 values, so
-	1024 is 1k and 1024k is 1m and so on, unless the suffix is explicitly
-	set to a base 10 value using 'kib', 'mib', 'gib', etc. If that is the
-	case, then 1000 is used as the multiplier. This can be handy for
-	disks, since manufacturers generally use base 10 values when listing
-	the capacity of a drive. If the option accepts an upper and lower
-	range, use a colon ':' or minus '-' to separate such values.  May also
-	include a prefix to indicate numbers base. If 0x is used, the number
-	is assumed to be hexadecimal.  See irange.
-bool	Boolean. Usually parsed as an integer, however only defined for
-	true and false (1 and 0).
-irange	Integer range with suffix. Allows value range to be given, such
-	as 1024-4096. A colon may also be used as the separator, eg
-	1k:4k. If the option allows two sets of ranges, they can be
-	specified with a ',' or '/' delimiter: 1k-4k/8k-32k. Also see
-	int.
-float_list	A list of floating point numbers, separated by a ':' character.
-
-With the above in mind, here follows the complete list of fio job
-parameters.
-
-name=str	ASCII name of the job. This may be used to override the
-		name printed by fio for this job. Otherwise the job
-		name is used. On the command line this parameter has the
-		special purpose of also signaling the start of a new
-		job.
-
-wait_for=str	Specifies the name of the already defined job to wait
-		for. Single waitee name only may be specified. If set, the job
-		won't be started until all workers of the waitee job are done.
-
-		Wait_for operates on the job name basis, so there are a few
-		limitations. First, the waitee must be defined prior to the
-		waiter job (meaning no forward references). Second, if a job
-		is being referenced as a waitee, it must have a unique name
-		(no duplicate waitees).
-
-description=str	Text description of the job. Doesn't do anything except
-		dump this text description when this job is run. It's
-		not parsed.
-
-directory=str	Prefix filenames with this directory. Used to place files
-		in a different location than "./". See the 'filename' option
-		for escaping certain characters.
-
-filename=str	Fio normally makes up a filename based on the job name,
-		thread number, and file number. If you want to share
-		files between threads in a job or several jobs, specify
-		a filename for each of them to override the default.
-		If the ioengine is file based, you can specify a number of
-		files by separating the names with a ':' colon. So if you
-		wanted a job to open /dev/sda and /dev/sdb as the two working
-		files, you would use filename=/dev/sda:/dev/sdb. On Windows,
-		disk devices are accessed as \\.\PhysicalDrive0 for the first
-		device, \\.\PhysicalDrive1 for the second etc. Note: Windows
-		and FreeBSD prevent write access to areas of the disk
-		containing in-use data (e.g. filesystems).
-		If the wanted filename does need to include a colon, then
-		escape that with a '\' character. For instance, if the filename
-		is "/dev/dsk/foo@3,0:c", then you would use
-		filename="/dev/dsk/foo@3,0\:c". '-' is a reserved name, meaning
-		stdin or stdout. Which of the two depends on the read/write
-		direction set.
-
-filename_format=str
-		If sharing multiple files between jobs, it is usually necessary
-		to  have fio generate the exact names that you want. By default,
-		fio will name a file based on the default file format
-		specification of jobname.jobnumber.filenumber. With this
-		option, that can be customized. Fio will recognize and replace
-		the following keywords in this string:
-
-		$jobname
-			The name of the worker thread or process.
-
-		$jobnum
-			The incremental number of the worker thread or
-			process.
-
-		$filenum
-			The incremental number of the file for that worker
-			thread or process.
-
-		To have dependent jobs share a set of files, this option can
-		be set to have fio generate filenames that are shared between
-		the two. For instance, if testfiles.$filenum is specified,
-		file number 4 for any job will be named testfiles.4. The
-		default of $jobname.$jobnum.$filenum will be used if
-		no other format specifier is given.
-
-unique_filename=bool	To avoid collisions between networked clients, fio
-		defaults to prefixing any generated filenames (with a directory
-		specified) with the source of the client connecting. To disable
-		this behavior, set this option to 0.
-
-opendir=str	Tell fio to recursively add any file it can find in this
-		directory and down the file system tree.
-
-lockfile=str	Fio defaults to not locking any files before it does
-		IO to them. If a file or file descriptor is shared, fio
-		can serialize IO to that file to make the end result
-		consistent. This is usual for emulating real workloads that
-		share files. The lock modes are:
-
-			none		No locking. The default.
-			exclusive	Only one thread/process may do IO,
-					excluding all others.
-			readwrite	Read-write locking on the file. Many
-					readers may access the file at the
-					same time, but writes get exclusive
-					access.
-
-readwrite=str
-rw=str		Type of io pattern. Accepted values are:
-
-			read		Sequential reads
-			write		Sequential writes
-			randwrite	Random writes
-			randread	Random reads
-			rw,readwrite	Sequential mixed reads and writes
-			randrw		Random mixed reads and writes
-			trimwrite	Mixed trims and writes. Blocks will be
-					trimmed first, then written to.
-
-		Fio defaults to read if the option is not specified.
-		For the mixed io types, the default is to split them 50/50.
-		For certain types of io the result may still be skewed a bit,
-		since the speed may be different. It is possible to specify
-		a number of IO's to do before getting a new offset, this is
-		done by appending a ':<nr>' to the end of the string given.
-		For a random read, it would look like 'rw=randread:8' for
-		passing in an offset modifier with a value of 8. If the
-		suffix is used with a sequential IO pattern, then the value
-		specified will be added to the generated offset for each IO.
-		For instance, using rw=write:4k will skip 4k for every
-		write. It turns sequential IO into sequential IO with holes.
-		See the 'rw_sequencer' option.
-
-rw_sequencer=str If an offset modifier is given by appending a number to
-		the rw=<str> line, then this option controls how that
-		number modifies the IO offset being generated. Accepted
-		values are:
-
-			sequential	Generate sequential offset
-			identical	Generate the same offset
-
-		'sequential' is only useful for random IO, where fio would
-		normally generate a new random offset for every IO. If you
-		append eg 8 to randread, you would get a new random offset for
-		every 8 IO's. The result would be a seek for only every 8
-		IO's, instead of for every IO. Use rw=randread:8 to specify
-		that. As sequential IO is already sequential, setting
-		'sequential' for that would not result in any differences.
-		'identical' behaves in a similar fashion, except it sends
-		the same offset 8 number of times before generating a new
-		offset.
-
-kb_base=int	The base unit for a kilobyte. The defacto base is 2^10, 1024.
-		Storage manufacturers like to use 10^3 or 1000 as a base
-		ten unit instead, for obvious reasons. Allow values are
-		1024 or 1000, with 1024 being the default.
-
-unified_rw_reporting=bool	Fio normally reports statistics on a per
-		data direction basis, meaning that read, write, and trim are
-		accounted and reported separately. If this option is set,
-		the fio will sum the results and report them as "mixed"
-		instead.
-
-randrepeat=bool	For random IO workloads, seed the generator in a predictable
-		way so that results are repeatable across repetitions.
-		Defaults to true.
-
-randseed=int	Seed the random number generators based on this seed value, to
-		be able to control what sequence of output is being generated.
-		If not set, the random sequence depends on the randrepeat
-		setting.
-
-fallocate=str	Whether pre-allocation is performed when laying down files.
-		Accepted values are:
-
-			none		Do not pre-allocate space
-			posix		Pre-allocate via posix_fallocate()
-			keep		Pre-allocate via fallocate() with
-					FALLOC_FL_KEEP_SIZE set
-			0		Backward-compatible alias for 'none'
-			1		Backward-compatible alias for 'posix'
-
-		May not be available on all supported platforms. 'keep' is only
-		available on Linux.If using ZFS on Solaris this must be set to
-		'none' because ZFS doesn't support it. Default: 'posix'.
-
-fadvise_hint=bool By default, fio will use fadvise() to advise the kernel
-		on what IO patterns it is likely to issue. Sometimes you
-		want to test specific IO patterns without telling the
-		kernel about it, in which case you can disable this option.
-		The following options are supported:
-
-			sequential	Use FADV_SEQUENTIAL
-			random		Use FADV_RANDOM
-			1		Backwards-compatible hint for basing
-					the hint on the fio workload. Will use
-					FADV_SEQUENTIAL for a sequential
-					workload, and FADV_RANDOM for a random
-					workload.
-			0		Backwards-compatible setting for not
-					issing a fadvise hint.
-
-fadvise_stream=int Notify the kernel what write stream ID to place these
-		writes under. Only supported on Linux. Note, this option
-		may change going forward.
-
-size=int	The total size of file io for this job. Fio will run until
-		this many bytes has been transferred, unless runtime is
-		limited by other options (such as 'runtime', for instance,
-		or increased/decreased by 'io_size'). Unless specific nrfiles
-		and filesize options are given, fio will divide this size
-		between the available files specified by the job. If not set,
-		fio will use the full size of the given files or devices.
-		If the files do not exist, size must be given. It is also
-		possible to give size as a percentage between 1 and 100. If
-		size=20% is given, fio will use 20% of the full size of the
-		given files or devices.
-
-io_size=int
-io_limit=int	Normally fio operates within the region set by 'size', which
-		means that the 'size' option sets both the region and size of
-		IO to be performed. Sometimes that is not what you want. With
-		this option, it is possible to define just the amount of IO
-		that fio should do. For instance, if 'size' is set to 20G and
-		'io_size' is set to 5G, fio will perform IO within the first
-		20G but exit when 5G have been done. The opposite is also
-		possible - if 'size' is set to 20G, and 'io_size' is set to
-		40G, then fio will do 40G of IO within the 0..20G region.
-
-filesize=int	Individual file sizes. May be a range, in which case fio
-		will select sizes for files at random within the given range
-		and limited to 'size' in total (if that is given). If not
-		given, each created file is the same size.
-
-file_append=bool	Perform IO after the end of the file. Normally fio will
-		operate within the size of a file. If this option is set, then
-		fio will append to the file instead. This has identical
-		behavior to setting offset to the size of a file. This option
-		is ignored on non-regular files.
-
-fill_device=bool
-fill_fs=bool	Sets size to something really large and waits for ENOSPC (no
-		space left on device) as the terminating condition. Only makes
-		sense with sequential write. For a read workload, the mount
-		point will be filled first then IO started on the result. This
-		option doesn't make sense if operating on a raw device node,
-		since the size of that is already known by the file system.
-		Additionally, writing beyond end-of-device will not return
-		ENOSPC there.
-
-blocksize=int
-bs=int		The block size used for the io units. Defaults to 4k. Values
-		can be given for both read and writes. If a single int is
-		given, it will apply to both. If a second int is specified
-		after a comma, it will apply to writes only. In other words,
-		the format is either bs=read_and_write or bs=read,write,trim.
-		bs=4k,8k will thus use 4k blocks for reads, 8k blocks for
-		writes, and 8k for trims. You can terminate the list with
-		a trailing comma. bs=4k,8k, would use the default value for
-		trims.. If you only wish to set the write size, you
-		can do so by passing an empty read size - bs=,8k will set
-		8k for writes and leave the read default value.
-
-blockalign=int
-ba=int		At what boundary to align random IO offsets. Defaults to
-		the same as 'blocksize' the minimum blocksize given.
-		Minimum alignment is typically 512b for using direct IO,
-		though it usually depends on the hardware block size. This
-		option is mutually exclusive with using a random map for
-		files, so it will turn off that option.
-
-blocksize_range=irange
-bsrange=irange	Instead of giving a single block size, specify a range
-		and fio will mix the issued io block sizes. The issued
-		io unit will always be a multiple of the minimum value
-		given (also see bs_unaligned). Applies to both reads and
-		writes, however a second range can be given after a comma.
-		See bs=.
-
-bssplit=str	Sometimes you want even finer grained control of the
-		block sizes issued, not just an even split between them.
-		This option allows you to weight various block sizes,
-		so that you are able to define a specific amount of
-		block sizes issued. The format for this option is:
-
-			bssplit=blocksize/percentage:blocksize/percentage
-
-		for as many block sizes as needed. So if you want to define
-		a workload that has 50% 64k blocks, 10% 4k blocks, and
-		40% 32k blocks, you would write:
-
-			bssplit=4k/10:64k/50:32k/40
-
-		Ordering does not matter. If the percentage is left blank,
-		fio will fill in the remaining values evenly. So a bssplit
-		option like this one:
-
-			bssplit=4k/50:1k/:32k/
-
-		would have 50% 4k ios, and 25% 1k and 32k ios. The percentages
-		always add up to 100, if bssplit is given a range that adds
-		up to more, it will error out.
-
-		bssplit also supports giving separate splits to reads and
-		writes. The format is identical to what bs= accepts. You
-		have to separate the read and write parts with a comma. So
-		if you want a workload that has 50% 2k reads and 50% 4k reads,
-		while having 90% 4k writes and 10% 8k writes, you would
-		specify:
-
-		bssplit=2k/50:4k/50,4k/90:8k/10
-
-blocksize_unaligned
-bs_unaligned	If this option is given, any byte size value within bsrange
-		may be used as a block range. This typically wont work with
-		direct IO, as that normally requires sector alignment.
-
-bs_is_seq_rand	If this option is set, fio will use the normal read,write
-		blocksize settings as sequential,random instead. Any random
-		read or write will use the WRITE blocksize settings, and any
-		sequential read or write will use the READ blocksize setting.
-
-zero_buffers	If this option is given, fio will init the IO buffers to
-		all zeroes. The default is to fill them with random data.
-
-refill_buffers	If this option is given, fio will refill the IO buffers
-		on every submit. The default is to only fill it at init
-		time and reuse that data. Only makes sense if zero_buffers
-		isn't specified, naturally. If data verification is enabled,
-		refill_buffers is also automatically enabled.
-
-scramble_buffers=bool	If refill_buffers is too costly and the target is
-		using data deduplication, then setting this option will
-		slightly modify the IO buffer contents to defeat normal
-		de-dupe attempts. This is not enough to defeat more clever
-		block compression attempts, but it will stop naive dedupe of
-		blocks. Default: true.
-
-buffer_compress_percentage=int	If this is set, then fio will attempt to
-		provide IO buffer content (on WRITEs) that compress to
-		the specified level. Fio does this by providing a mix of
-		random data and a fixed pattern. The fixed pattern is either
-		zeroes, or the pattern specified by buffer_pattern. If the
-		pattern option is used, it might skew the compression ratio
-		slightly. Note that this is per block size unit, for file/disk
-		wide compression level that matches this setting, you'll also
-		want to set refill_buffers.
-
-buffer_compress_chunk=int	See buffer_compress_percentage. This
-		setting allows fio to manage how big the ranges of random
-		data and zeroed data is. Without this set, fio will
-		provide buffer_compress_percentage of blocksize random
-		data, followed by the remaining zeroed. With this set
-		to some chunk size smaller than the block size, fio can
-		alternate random and zeroed data throughout the IO
-		buffer.
-
-buffer_pattern=str	If set, fio will fill the io buffers with this
-		pattern. If not set, the contents of io buffers is defined by
-		the other options related to buffer contents. The setting can
-		be any pattern of bytes, and can be prefixed with 0x for hex
-		values. It may also be a string, where the string must then
-		be wrapped with "", e.g.:
+.. option:: trim_backlog=int
 
-		buffer_pattern="abcd"
-		  or
-		buffer_pattern=-12
-		  or
-		buffer_pattern=0xdeadface
+	Trim after this number of blocks are written.
 
-		Also you can combine everything together in any order:
-		buffer_pattern=0xdeadface"abcd"-12
+.. option:: trim_backlog_batch=int
 
-dedupe_percentage=int	If set, fio will generate this percentage of
-		identical buffers when writing. These buffers will be
-		naturally dedupable. The contents of the buffers depend on
-		what other buffer compression settings have been set. It's
-		possible to have the individual buffers either fully
-		compressible, or not at all. This option only controls the
-		distribution of unique buffers.
-
-nrfiles=int	Number of files to use for this job. Defaults to 1.
-
-openfiles=int	Number of files to keep open at the same time. Defaults to
-		the same as nrfiles, can be set smaller to limit the number
-		simultaneous opens.
-
-file_service_type=str  Defines how fio decides which file from a job to
-		service next. The following types are defined:
-
-			random	Just choose a file at random.
-
-			roundrobin  Round robin over open files. This
-				is the default.
-
-			sequential  Finish one file before moving on to
-				the next. Multiple files can still be
-				open depending on 'openfiles'.
-
-			zipf	Use a zipfian distribution to decide what file
-				to access.
-
-			pareto	Use a pareto distribution to decide what file
-				to access.
-
-			gauss	Use a gaussian (normal) distribution to decide
-				what file to access.
-
-		For random, roundrobin, and sequential, a postfix can be
-		appended to tell fio how many I/Os to issue before switching
-		to a new file. For example, specifying
-		'file_service_type=random:8' would cause fio to issue 8 I/Os
-		before selecting a new file at random. For the non-uniform
-		distributions, a floating point postfix can be given to
-		influence how the distribution is skewed. See
-		'random_distribution' for a description of how that would work.
-
-ioengine=str	Defines how the job issues io to the file. The following
-		types are defined:
-
-			sync	Basic read(2) or write(2) io. lseek(2) is
-				used to position the io location.
-
-			psync 	Basic pread(2) or pwrite(2) io. Default on all
-				supported operating systems except for Windows.
-
-			vsync	Basic readv(2) or writev(2) IO.
-
-			pvsync	Basic preadv(2) or pwritev(2) IO.
-
-			pvsync2	Basic preadv2(2) or pwritev2(2) IO.
-
-			libaio	Linux native asynchronous io. Note that Linux
-				may only support queued behaviour with
-				non-buffered IO (set direct=1 or buffered=0).
-				This engine defines engine specific options.
-
-			posixaio glibc posix asynchronous io.
-
-			solarisaio Solaris native asynchronous io.
-
-			windowsaio Windows native asynchronous io.
-				Default on Windows.
-
-			mmap	File is memory mapped and data copied
-				to/from using memcpy(3).
-
-			splice	splice(2) is used to transfer the data and
-				vmsplice(2) to transfer data from user
-				space to the kernel.
-
-			sg	SCSI generic sg v3 io. May either be
-				synchronous using the SG_IO ioctl, or if
-				the target is an sg character device
-				we use read(2) and write(2) for asynchronous
-				io.
-
-			null	Doesn't transfer any data, just pretends
-				to. This is mainly used to exercise fio
-				itself and for debugging/testing purposes.
-
-			net	Transfer over the network to given host:port.
-				Depending on the protocol used, the hostname,
-				port, listen and filename options are used to
-				specify what sort of connection to make, while
-				the protocol option determines which protocol
-				will be used.
-				This engine defines engine specific options.
-
-			netsplice Like net, but uses splice/vmsplice to
-				map data and send/receive.
-				This engine defines engine specific options.
-
-			cpuio	Doesn't transfer any data, but burns CPU
-				cycles according to the cpuload= and
-				cpuchunks= options. Setting cpuload=85
-				will cause that job to do nothing but burn
-				85% of the CPU. In case of SMP machines,
-				use numjobs=<no_of_cpu> to get desired CPU
-				usage, as the cpuload only loads a single
-				CPU at the desired rate. A job never finishes
-				unless there is at least one non-cpuio job.
-
-			guasi	The GUASI IO engine is the Generic Userspace
-				Asyncronous Syscall Interface approach
-				to async IO. See
-
-				http://www.xmailserver.org/guasi-lib.html
-
-				for more info on GUASI.
-
-			rdma    The RDMA I/O engine  supports  both  RDMA
-				memory semantics (RDMA_WRITE/RDMA_READ) and
-				channel semantics (Send/Recv) for the
-				InfiniBand, RoCE and iWARP protocols.
-
-			falloc	IO engine that does regular fallocate to
-				simulate data transfer as fio ioengine.
-				DDIR_READ  does fallocate(,mode = keep_size,)
-				DDIR_WRITE does fallocate(,mode = 0)
-				DDIR_TRIM  does fallocate(,mode = punch_hole)
-
-			e4defrag IO engine that does regular EXT4_IOC_MOVE_EXT
-				ioctls to simulate defragment activity in
-				request to DDIR_WRITE event
-
-			rbd	IO engine supporting direct access to Ceph
-				Rados Block Devices (RBD) via librbd without
-				the need to use the kernel rbd driver. This
-				ioengine defines engine specific options.
-
-			gfapi	Using Glusterfs libgfapi sync interface to
-				direct access to Glusterfs volumes without
-				options.
-
-			gfapi_async Using Glusterfs libgfapi async interface
-				to direct access to Glusterfs volumes without
-				having to go through FUSE. This ioengine
-				defines engine specific options.
-
-			libhdfs	Read and write through Hadoop (HDFS).
-				This engine interprets offsets a little
-				differently. In HDFS, files once created
-				cannot be modified. So random writes are not
-				possible. To imitate this, libhdfs engine
-				creates bunch of small files, and engine will
-				pick a file out of those files based on the
-				offset generated by fio backend. Each jobs uses
-				it's own connection to HDFS.
-
-			mtd	Read, write and erase an MTD character device
-				(e.g., /dev/mtd0). Discards are treated as
-				erases. Depending on the underlying device
-				type, the I/O may have to go in a certain
-				pattern, e.g., on NAND, writing sequentially
-				to erase blocks and discarding before
-				overwriting. The writetrim mode works well
-				for this constraint.
-
-			pmemblk	Read and write through the NVML libpmemblk
-				interface.
-
-			dev-dax Read and write through a DAX device exposed
-				from persistent memory.
-
-			external Prefix to specify loading an external
-				IO engine object file. Append the engine
-				filename, eg ioengine=external:/tmp/foo.o
-				to load ioengine foo.o in /tmp.
-
-iodepth=int	This defines how many io units to keep in flight against
-		the file. The default is 1 for each file defined in this
-		job, can be overridden with a larger value for higher
-		concurrency. Note that increasing iodepth beyond 1 will not
-		affect synchronous ioengines (except for small degress when
-		verify_async is in use). Even async engines may impose OS
-		restrictions causing the desired depth not to be achieved.
-		This may happen on Linux when using libaio and not setting
-		direct=1, since buffered IO is not async on that OS. Keep an
-		eye on the IO depth distribution in the fio output to verify
-		that the achieved depth is as expected. Default: 1.
-
-iodepth_batch_submit=int
-iodepth_batch=int This defines how many pieces of IO to submit at once.
-		It defaults to 1 which means that we submit each IO
-		as soon as it is available, but can be raised to submit
-		bigger batches of IO at the time. If it is set to 0 the iodepth
-		value will be used.
-
-iodepth_batch_complete_min=int
-iodepth_batch_complete=int This defines how many pieces of IO to retrieve
-		at once. It defaults to 1 which means that we'll ask
-		for a minimum of 1 IO in the retrieval process from
-		the kernel. The IO retrieval will go on until we
-		hit the limit set by iodepth_low. If this variable is
-		set to 0, then fio will always check for completed
-		events before queuing more IO. This helps reduce
-		IO latency, at the cost of more retrieval system calls.
-
-iodepth_batch_complete_max=int This defines maximum pieces of IO to
-		retrieve at once. This variable should be used along with
-		iodepth_batch_complete_min=int variable, specifying the range
-		of min and max amount of IO which should be retrieved. By default
-		it is equal to iodepth_batch_complete_min value.
+	Trim this number of I/O blocks.
 
-		Example #1:
+.. option:: experimental_verify=bool
 
-		iodepth_batch_complete_min=1
-		iodepth_batch_complete_max=<iodepth>
+	Enable experimental verification.
 
-		which means that we will retrieve at least 1 IO and up to the
-		whole submitted queue depth. If none of IO has been completed
-		yet, we will wait.
+Steady state
+~~~~~~~~~~~~
 
-		Example #2:
+.. option:: steadystate=str:float, ss=str:float
 
-		iodepth_batch_complete_min=0
-		iodepth_batch_complete_max=<iodepth>
+	Define the criterion and limit for assessing steady state performance. The
+	first parameter designates the criterion whereas the second parameter sets
+	the threshold. When the criterion falls below the threshold for the
+	specified duration, the job will stop. For example, ``iops_slope:0.1%`` will
+	direct fio to terminate the job when the least squares regression slope
+	falls below 0.1% of the mean IOPS. If :option:`group_reporting` is enabled
+	this will apply to all jobs in the group. Below is the list of available
+	steady state assessment criteria. All assessments are carried out using only
+	data from the rolling collection window. Threshold limits can be expressed
+	as a fixed value or as a percentage of the mean in the collection window.
 
-		which means that we can retrieve up to the whole submitted
-		queue depth, but if none of IO has been completed yet, we will
-		NOT wait and immediately exit the system call. In this example
-		we simply do polling.
-
-iodepth_low=int	The low water mark indicating when to start filling
-		the queue again. Defaults to the same as iodepth, meaning
-		that fio will attempt to keep the queue full at all times.
-		If iodepth is set to eg 16 and iodepth_low is set to 4, then
-		after fio has filled the queue of 16 requests, it will let
-		the depth drain down to 4 before starting to fill it again.
-
-io_submit_mode=str	This option controls how fio submits the IO to
-		the IO engine. The default is 'inline', which means that the
-		fio job threads submit and reap IO directly. If set to
-		'offload', the job threads will offload IO submission to a
-		dedicated pool of IO threads. This requires some coordination
-		and thus has a bit of extra overhead, especially for lower
-		queue depth IO where it can increase latencies. The benefit
-		is that fio can manage submission rates independently of
-		the device completion rates. This avoids skewed latency
-		reporting if IO gets back up on the device side (the
-		coordinated omission problem).
-
-direct=bool	If value is true, use non-buffered io. This is usually
-		O_DIRECT. Note that ZFS on Solaris doesn't support direct io.
-		On Windows the synchronous ioengines don't support direct io.
-
-atomic=bool	If value is true, attempt to use atomic direct IO. Atomic
-		writes are guaranteed to be stable once acknowledged by
-		the operating system. Only Linux supports O_ATOMIC right
-		now.
-
-buffered=bool	If value is true, use buffered io. This is the opposite
-		of the 'direct' option. Defaults to true.
-
-offset=int	Start io at the given offset in the file. The data before
-		the given offset will not be touched. This effectively
-		caps the file size at real_size - offset.
-
-offset_increment=int	If this is provided, then the real offset becomes
-		offset + offset_increment * thread_number, where the thread
-		number is a counter that starts at 0 and is incremented for
-		each sub-job (i.e. when numjobs option is specified). This
-		option is useful if there are several jobs which are intended
-		to operate on a file in parallel disjoint segments, with
-		even spacing between the starting points.
-
-number_ios=int	Fio will normally perform IOs until it has exhausted the size
-		of the region set by size=, or if it exhaust the allocated
-		time (or hits an error condition). With this setting, the
-		range/size can be set independently of the number of IOs to
-		perform. When fio reaches this number, it will exit normally
-		and report status. Note that this does not extend the amount
-		of IO that will be done, it will only stop fio if this
-		condition is met before other end-of-job criteria.
-
-fsync=int	If writing to a file, issue a sync of the dirty data
-		for every number of blocks given. For example, if you give
-		32 as a parameter, fio will sync the file for every 32
-		writes issued. If fio is using non-buffered io, we may
-		not sync the file. The exception is the sg io engine, which
-		synchronizes the disk cache anyway.
-
-fdatasync=int	Like fsync= but uses fdatasync() to only sync data and not
-		metadata blocks.
-		In FreeBSD and Windows there is no fdatasync(), this falls back
-		to using fsync()
-
-sync_file_range=str:val	Use sync_file_range() for every 'val' number of
-		write operations. Fio will track range of writes that
-		have happened since the last sync_file_range() call. 'str'
-		can currently be one or more of:
-
-		wait_before	SYNC_FILE_RANGE_WAIT_BEFORE
-		write		SYNC_FILE_RANGE_WRITE
-		wait_after	SYNC_FILE_RANGE_WAIT_AFTER
-
-		So if you do sync_file_range=wait_before,write:8, fio would
-		use SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE for
-		every 8 writes. Also see the sync_file_range(2) man page.
-		This option is Linux specific.
-
-overwrite=bool	If true, writes to a file will always overwrite existing
-		data. If the file doesn't already exist, it will be
-		created before the write phase begins. If the file exists
-		and is large enough for the specified write phase, nothing
-		will be done.
-
-end_fsync=bool	If true, fsync file contents when a write stage has completed.
-
-fsync_on_close=bool	If true, fio will fsync() a dirty file on close.
-		This differs from end_fsync in that it will happen on every
-		file close, not just at the end of the job.
-
-rwmixread=int	How large a percentage of the mix should be reads.
-
-rwmixwrite=int	How large a percentage of the mix should be writes. If both
-		rwmixread and rwmixwrite is given and the values do not add
-		up to 100%, the latter of the two will be used to override
-		the first. This may interfere with a given rate setting,
-		if fio is asked to limit reads or writes to a certain rate.
-		If that is the case, then the distribution may be skewed.
-
-random_distribution=str:float	By default, fio will use a completely uniform
-		random distribution when asked to perform random IO. Sometimes
-		it is useful to skew the distribution in specific ways,
-		ensuring that some parts of the data is more hot than others.
-		fio includes the following distribution models:
-
-		random		Uniform random distribution
-		zipf		Zipf distribution
-		pareto		Pareto distribution
-		gauss		Normal (gaussian) distribution
-		zoned		Zoned random distribution
-
-		When using a zipf or pareto distribution, an input value
-		is also needed to define the access pattern. For zipf, this
-		is the zipf theta. For pareto, it's the pareto power. Fio
-		includes a test program, genzipf, that can be used visualize
-		what the given input values will yield in terms of hit rates.
-		If you wanted to use zipf with a theta of 1.2, you would use
-		random_distribution=zipf:1.2 as the option. If a non-uniform
-		model is used, fio will disable use of the random map. For
-		the gauss distribution, a normal deviation is supplied as
-		a value between 0 and 100.
-
-		For a zoned distribution, fio supports specifying percentages
-		of IO access that should fall within what range of the file or
-		device. For example, given a criteria of:
-
-			60% of accesses should be to the first 10%
-			30% of accesses should be to the next 20%
-			8% of accesses should be to to the next 30%
-			2% of accesses should be to the next 40%
-
-		we can define that through zoning of the random accesses. For
-		the above example, the user would do:
-
-			random_distribution=zoned:60/10:30/20:8/30:2/40
-
-		similarly to how bssplit works for setting ranges and
-		percentages of block sizes. Like bssplit, it's possible to
-		specify separate zones for reads, writes, and trims. If just
-		one set is given, it'll apply to all of them.
-
-percentage_random=int	For a random workload, set how big a percentage should
-		be random. This defaults to 100%, in which case the workload
-		is fully random. It can be set from anywhere from 0 to 100.
-		Setting it to 0 would make the workload fully sequential. Any
-		setting in between will result in a random mix of sequential
-		and random IO, at the given percentages. It is possible to
-		set different values for reads, writes, and trim. To do so,
-		simply use a comma separated list. See blocksize.
-
-norandommap	Normally fio will cover every block of the file when doing
-		random IO. If this option is given, fio will just get a
-		new random offset without looking at past io history. This
-		means that some blocks may not be read or written, and that
-		some blocks may be read/written more than once. If this option
-		is used with verify= and multiple blocksizes (via bsrange=),
-		only intact blocks are verified, i.e., partially-overwritten
-		blocks are ignored.
-
-softrandommap=bool See norandommap. If fio runs with the random block map
-		enabled and it fails to allocate the map, if this option is
-		set it will continue without a random block map. As coverage
-		will not be as complete as with random maps, this option is
-		disabled by default.
-
-random_generator=str	Fio supports the following engines for generating
-		IO offsets for random IO:
-
-		tausworthe	Strong 2^88 cycle random number generator
-		lfsr		Linear feedback shift register generator
-		tausworthe64	Strong 64-bit 2^258 cycle random number
-				generator
-
-		Tausworthe is a strong random number generator, but it
-		requires tracking on the side if we want to ensure that
-		blocks are only read or written once. LFSR guarantees
-		that we never generate the same offset twice, and it's
-		also less computationally expensive. It's not a true
-		random generator, however, though for IO purposes it's
-		typically good enough. LFSR only works with single
-		block sizes, not with workloads that use multiple block
-		sizes. If used with such a workload, fio may read or write
-		some blocks multiple times. The default value is tausworthe,
-		unless the required space exceeds 2^32 blocks. If it does,
-		then tausworthe64 is selected automatically.
-
-nice=int	Run the job with the given nice value. See man nice(2).
-
-     On Windows, values less than -15 set the process class to "High";
-     -1 through -15 set "Above Normal"; 1 through 15 "Below Normal";
-     and above 15 "Idle" priority class.
-
-prio=int	Set the io priority value of this job. Linux limits us to
-		a positive value between 0 and 7, with 0 being the highest.
-		See man ionice(1). Refer to an appropriate manpage for
-		other operating systems since meaning of priority may differ.
-
-prioclass=int	Set the io priority class. See man ionice(1).
-
-thinktime=int	Stall the job x microseconds after an io has completed before
-		issuing the next. May be used to simulate processing being
-		done by an application. See thinktime_blocks and
-		thinktime_spin.
-
-thinktime_spin=int
-		Only valid if thinktime is set - pretend to spend CPU time
-		doing something with the data received, before falling back
-		to sleeping for the rest of the period specified by
-		thinktime.
-
-thinktime_blocks=int
-		Only valid if thinktime is set - control how many blocks
-		to issue, before waiting 'thinktime' usecs. If not set,
-		defaults to 1 which will make fio wait 'thinktime' usecs
-		after every block. This effectively makes any queue depth
-		setting redundant, since no more than 1 IO will be queued
-		before we have to complete it and do our thinktime. In
-		other words, this setting effectively caps the queue depth
-		if the latter is larger.
-
-rate=int	Cap the bandwidth used by this job. The number is in bytes/sec,
-		the normal suffix rules apply. You can use rate=500k to limit
-		reads and writes to 500k each, or you can specify read and
-		writes separately. Using rate=1m,500k would limit reads to
-		1MB/sec and writes to 500KB/sec. Capping only reads or
-		writes can be done with rate=,500k or rate=500k,. The former
-		will only limit writes (to 500KB/sec), the latter will only
-		limit reads.
-
-rate_min=int	Tell fio to do whatever it can to maintain at least this
-		bandwidth. Failing to meet this requirement, will cause
-		the job to exit. The same format as rate is used for
-		read vs write separation.
-
-rate_iops=int	Cap the bandwidth to this number of IOPS. Basically the same
-		as rate, just specified independently of bandwidth. If the
-		job is given a block size range instead of a fixed value,
-		the smallest block size is used as the metric. The same format
-		as rate is used for read vs write separation.
-
-rate_iops_min=int If fio doesn't meet this rate of IO, it will cause
-		the job to exit. The same format as rate is used for read vs
-		write separation.
-
-rate_process=str	This option controls how fio manages rated IO
-		submissions. The default is 'linear', which submits IO in a
-		linear fashion with fixed delays between IOs that gets
-		adjusted based on IO completion rates. If this is set to
-		'poisson', fio will submit IO based on a more real world
-		random request flow, known as the Poisson process
-		(https://en.wikipedia.org/wiki/Poisson_process). The lambda
-		will be 10^6 / IOPS for the given workload.
-
-latency_target=int	If set, fio will attempt to find the max performance
-		point that the given workload will run at while maintaining a
-		latency below this target. The values is given in microseconds.
-		See latency_window and latency_percentile
-
-latency_window=int	Used with latency_target to specify the sample window
-		that the job is run at varying queue depths to test the
-		performance. The value is given in microseconds.
-
-latency_percentile=float	The percentage of IOs that must fall within the
-		criteria specified by latency_target and latency_window. If not
-		set, this defaults to 100.0, meaning that all IOs must be equal
-		or below to the value set by latency_target.
-
-max_latency=int	If set, fio will exit the job if it exceeds this maximum
-		latency. It will exit with an ETIME error.
-
-rate_cycle=int	Average bandwidth for 'rate' and 'rate_min' over this number
-		of milliseconds.
-
-cpumask=int	Set the CPU affinity of this job. The parameter given is a
-		bitmask of allowed CPU's the job may run on. So if you want
-		the allowed CPUs to be 1 and 5, you would pass the decimal
-		value of (1 << 1 | 1 << 5), or 34. See man
-		sched_setaffinity(2). This may not work on all supported
-		operating systems or kernel versions. This option doesn't
-		work well for a higher CPU count than what you can store in
-		an integer mask, so it can only control cpus 1-32. For
-		boxes with larger CPU counts, use cpus_allowed.
-
-cpus_allowed=str Controls the same options as cpumask, but it allows a text
-		setting of the permitted CPUs instead. So to use CPUs 1 and
-		5, you would specify cpus_allowed=1,5. This options also
-		allows a range of CPUs. Say you wanted a binding to CPUs
-		1, 5, and 8-15, you would set cpus_allowed=1,5,8-15.
-
-cpus_allowed_policy=str Set the policy of how fio distributes the CPUs
-		specified by cpus_allowed or cpumask. Two policies are
-		supported:
-
-		shared	All jobs will share the CPU set specified.
-		split	Each job will get a unique CPU from the CPU set.
-
-		'shared' is the default behaviour, if the option isn't
-		specified. If split is specified, then fio will will assign
-		one cpu per job. If not enough CPUs are given for the jobs
-		listed, then fio will roundrobin the CPUs in the set.
-
-numa_cpu_nodes=str Set this job running on specified NUMA nodes' CPUs. The
-		arguments allow comma delimited list of cpu numbers,
-		A-B ranges, or 'all'. Note, to enable numa options support,
-		fio must be built on a system with libnuma-dev(el) installed.
-
-numa_mem_policy=str Set this job's memory policy and corresponding NUMA
-		nodes. Format of the arguments:
-			<mode>[:<nodelist>]
-		`mode' is one of the following memory policy:
-			default, prefer, bind, interleave, local
-		For `default' and `local' memory policy, no node is
-		needed to be specified.
-		For `prefer', only one node is allowed.
-		For `bind' and `interleave', it allow comma delimited
-		list of numbers, A-B ranges, or 'all'.
-
-startdelay=time	Start this job the specified number of seconds after fio
-		has started. Only useful if the job file contains several
-		jobs, and you want to delay starting some jobs to a certain
-		time.
-
-runtime=time	Tell fio to terminate processing after the specified number
-		of seconds. It can be quite hard to determine for how long
-		a specified job will run, so this parameter is handy to
-		cap the total runtime to a given time.
-
-time_based	If set, fio will run for the duration of the runtime
-		specified even if the file(s) are completely read or
-		written. It will simply loop over the same workload
-		as many times as the runtime allows.
-
-ramp_time=time	If set, fio will run the specified workload for this amount
-		of time before logging any performance numbers. Useful for
-		letting performance settle before logging results, thus
-		minimizing the runtime required for stable results. Note
-		that the ramp_time is considered lead in time for a job,
-		thus it will increase the total runtime if a special timeout
-		or runtime is specified.
-
-steadystate=str:float
-ss=str:float	Define the criterion and limit for assessing steady state
-		performance. The first parameter designates the criterion
-		whereas the second parameter sets the threshold. When the
-		criterion falls below the threshold for the specified duration,
-		the job will stop. For example, iops_slope:0.1% will direct fio
-		to terminate the job when the least squares regression slope
-		falls below 0.1% of the mean IOPS. If group_reporting is
-		enabled this will apply to all jobs in the group. Below is the
-		list of available steady state assessment criteria. All
-		assessments are carried out using only data from the rolling
-		collection window. Threshold limits can be expressed as a fixed
-		value or as a percentage of the mean in the collection window.
-			iops	Collect IOPS data. Stop the job if all
-				individual IOPS measurements are within the
-				specified limit of the mean IOPS (e.g., iops:2
-				means that all individual IOPS values must be
-				within 2 of the mean, whereas iops:0.2% means
-				that all individual IOPS values must be within
-				0.2% of	the mean IOPS to terminate the job).
-			iops_slope
-				Collect IOPS data and calculate the least
-				squares regression slope. Stop the job if the
-				slope falls below the specified limit.
-			bw	Collect bandwidth data. Stop the job if all
-				individual bandwidth measurements are within
-				the specified limit of the mean bandwidth.
-			bw_slope
-				Collect bandwidth data and calculate the least
-				squares regression slope. Stop the job if the
-				slope falls below the specified limit.
-
-steadystate_duration=time
-ss_dur=time	A rolling window of this duration will be used to judge whether
-		steady state has been reached. Data will be collected once per
-		second. The default is 0 which disables steady state detection.
-
-steadystate_ramp_time=time
-ss_ramp=time	Allow the job to run for the specified duration before
-		beginning data collection for checking the steady state job
-		termination criterion. The default is 0.
-
-invalidate=bool	Invalidate the buffer/page cache parts for this file prior
-		to starting io. Defaults to true.
-
-sync=bool	Use sync io for buffered writes. For the majority of the
-		io engines, this means using O_SYNC.
-
-iomem=str
-mem=str		Fio can use various types of memory as the io unit buffer.
-		The allowed values are:
-
-			malloc	Use memory from malloc(3) as the buffers.
-				Default memory type.
-
-			shm	Use shared memory as the buffers. Allocated
-				through shmget(2).
-
-			shmhuge	Same as shm, but use huge pages as backing.
-
-			mmap	Use mmap to allocate buffers. May either be
-				anonymous memory, or can be file backed if
-				a filename is given after the option. The
-				format is mem=mmap:/path/to/file.
-
-			mmaphuge Use a memory mapped huge file as the buffer
-				backing. Append filename after mmaphuge, ala
-				mem=mmaphuge:/hugetlbfs/file
-
-			mmapshared	Same as mmap, but use a MMAP_SHARED
-				mapping.
-
-		The area allocated is a function of the maximum allowed
-		bs size for the job, multiplied by the io depth given. Note
-		that for shmhuge and mmaphuge to work, the system must have
-		free huge pages allocated. This can normally be checked
-		and set by reading/writing /proc/sys/vm/nr_hugepages on a
-		Linux system. Fio assumes a huge page is 4MB in size. So
-		to calculate the number of huge pages you need for a given
-		job file, add up the io depth of all jobs (normally one unless
-		iodepth= is used) and multiply by the maximum bs set. Then
-		divide that number by the huge page size. You can see the
-		size of the huge pages in /proc/meminfo. If no huge pages
-		are allocated by having a non-zero number in nr_hugepages,
-		using mmaphuge or shmhuge will fail. Also see hugepage-size.
-
-		mmaphuge also needs to have hugetlbfs mounted and the file
-		location should point there. So if it's mounted in /huge,
-		you would use mem=mmaphuge:/huge/somefile.
-
-iomem_align=int	This indicates the memory alignment of the IO memory buffers.
-		Note that the given alignment is applied to the first IO unit
-		buffer, if using iodepth the alignment of the following buffers
-		are given by the bs used. In other words, if using a bs that is
-		a multiple of the page sized in the system, all buffers will
-		be aligned to this value. If using a bs that is not page
-		aligned, the alignment of subsequent IO memory buffers is the
-		sum of the iomem_align and bs used.
-
-hugepage-size=int
-		Defines the size of a huge page. Must at least be equal
-		to the system setting, see /proc/meminfo. Defaults to 4MB.
-		Should probably always be a multiple of megabytes, so using
-		hugepage-size=Xm is the preferred way to set this to avoid
-		setting a non-pow-2 bad value.
-
-exitall		When one job finishes, terminate the rest. The default is
-		to wait for each job to finish, sometimes that is not the
-		desired action.
-
-exitall_on_error	When one job finishes in error, terminate the rest. The
-		default is to wait for each job to finish.
-
-bwavgtime=int	Average the calculated bandwidth over the given time. Value
-		is specified in milliseconds. If the job also does bandwidth
-		logging through 'write_bw_log', then the minimum of this option
-		and 'log_avg_msec' will be used.  Default: 500ms.
-
-iopsavgtime=int	Average the calculated IOPS over the given time. Value
-		is specified in milliseconds. If the job also does IOPS logging
-		through 'write_iops_log', then the minimum of this option and
-		'log_avg_msec' will be used.  Default: 500ms.
-
-create_serialize=bool	If true, serialize the file creation for the jobs.
-			This may be handy to avoid interleaving of data
-			files, which may greatly depend on the filesystem
-			used and even the number of processors in the system.
-
-create_fsync=bool	fsync the data file after creation. This is the
-			default.
-
-create_on_open=bool	Don't pre-setup the files for IO, just create open()
-			when it's time to do IO to that file.
-
-create_only=bool	If true, fio will only run the setup phase of the job.
-			If files need to be laid out or updated on disk, only
-			that will be done. The actual job contents are not
-			executed.
-
-allow_file_create=bool	If true, fio is permitted to create files as part
-		of its workload. This is the default behavior. If this
-		option is false, then fio will error out if the files it
-		needs to use don't already exist. Default: true.
-
-allow_mounted_write=bool	If this isn't set, fio will abort jobs that
-		are destructive (eg that write) to what appears to be a
-		mounted device or partition. This should help catch creating
-		inadvertently destructive tests, not realizing that the test
-		will destroy data on the mounted file system. Default: false.
-
-pre_read=bool	If this is given, files will be pre-read into memory before
-		starting the given IO operation. This will also clear
-		the 'invalidate' flag, since it is pointless to pre-read
-		and then drop the cache. This will only work for IO engines
-		that are seek-able, since they allow you to read the same data
-		multiple times. Thus it will not work on eg network or splice
-		IO.
-
-unlink=bool	Unlink the job files when done. Not the default, as repeated
-		runs of that job would then waste time recreating the file
-		set again and again.
-
-unlink_each_loop=bool	Unlink job files after each iteration or loop.
-
-loops=int	Run the specified number of iterations of this job. Used
-		to repeat the same workload a given number of times. Defaults
-		to 1.
-
-verify_only	Do not perform specified workload---only verify data still
-		matches previous invocation of this workload. This option
-		allows one to check data multiple times at a later date
-		without overwriting it. This option makes sense only for
-		workloads that write data, and does not support workloads
-		with the time_based option set.
-
-do_verify=bool	Run the verify phase after a write phase. Only makes sense if
-		verify is set. Defaults to 1.
-
-verify=str	If writing to a file, fio can verify the file contents
-		after each iteration of the job. Each verification method also implies
-		verification of special header, which is written to the beginning of
-		each block. This header also includes meta information, like offset
-		of the block, block number, timestamp when block was written, etc.
-		verify=str can be combined with verify_pattern=str option.
-		The allowed values are:
-
-			md5	Use an md5 sum of the data area and store
-				it in the header of each block.
-
-			crc64	Use an experimental crc64 sum of the data
-				area and store it in the header of each
-				block.
-
-			crc32c	Use a crc32c sum of the data area and store
-				it in the header of each block.
-
-			crc32c-intel Use hardware assisted crc32c calculation
-				provided on SSE4.2 enabled processors. Falls
-				back to regular software crc32c, if not
-				supported by the system.
-
-			crc32	Use a crc32 sum of the data area and store
-				it in the header of each block.
-
-			crc16	Use a crc16 sum of the data area and store
-				it in the header of each block.
-
-			crc7	Use a crc7 sum of the data area and store
-				it in the header of each block.
-
-			xxhash	Use xxhash as the checksum function. Generally
-				the fastest software checksum that fio
-				supports.
-
-			sha512	Use sha512 as the checksum function.
-
-			sha256	Use sha256 as the checksum function.
-
-			sha1	Use optimized sha1 as the checksum function.
-
-			meta	This option is deprecated, since now meta information is
-				included in generic verification header and meta verification
-				happens by default. For detailed information see the description
-				of the verify=str setting. This option is kept because of
-				compatibility's sake with old configurations. Do not use it.
-
-			pattern	Verify a strict pattern. Normally fio includes
-				a header with some basic information and
-				checksumming, but if this option is set, only
-				the specific pattern set with 'verify_pattern'
-				is verified.
-
-			null	Only pretend to verify. Useful for testing
-				internals with ioengine=null, not for much
-				else.
-
-		This option can be used for repeated burn-in tests of a
-		system to make sure that the written data is also
-		correctly read back. If the data direction given is
-		a read or random read, fio will assume that it should
-		verify a previously written file. If the data direction
-		includes any form of write, the verify will be of the
-		newly written data.
-
-verifysort=bool	If set, fio will sort written verify blocks when it deems
-		it faster to read them back in a sorted manner. This is
-		often the case when overwriting an existing file, since
-		the blocks are already laid out in the file system. You
-		can ignore this option unless doing huge amounts of really
-		fast IO where the red-black tree sorting CPU time becomes
-		significant.
-
-verify_offset=int	Swap the verification header with data somewhere else
-			in the block before writing. Its swapped back before
-			verifying.
-
-verify_interval=int	Write the verification header at a finer granularity
-			than the blocksize. It will be written for chunks the
-			size of header_interval. blocksize should divide this
-			evenly.
-
-verify_pattern=str	If set, fio will fill the io buffers with this
-		pattern. Fio defaults to filling with totally random
-		bytes, but sometimes it's interesting to fill with a known
-		pattern for io verification purposes. Depending on the
-		width of the pattern, fio will fill 1/2/3/4 bytes of the
-		buffer at the time(it can be either a decimal or a hex number).
-		The verify_pattern if larger than a 32-bit quantity has to
-		be a hex number that starts with either "0x" or "0X". Use
-		with verify=str. Also, verify_pattern supports %o format,
-		which means that for each block offset will be written and
-		then verified back, e.g.:
+		**iops**
+			Collect IOPS data. Stop the job if all individual IOPS measurements
+			are within the specified limit of the mean IOPS (e.g., ``iops:2``
+			means that all individual IOPS values must be within 2 of the mean,
+			whereas ``iops:0.2%`` means that all individual IOPS values must be
+			within 0.2% of the mean IOPS to terminate the job).
 
-		verify_pattern=%o
+		**iops_slope**
+			Collect IOPS data and calculate the least squares regression
+			slope. Stop the job if the slope falls below the specified limit.
 
-		Or use combination of everything:
-		verify_pattern=0xff%o"abcd"-12
+		**bw**
+			Collect bandwidth data. Stop the job if all individual bandwidth
+			measurements are within the specified limit of the mean bandwidth.
+
+		**bw_slope**
+			Collect bandwidth data and calculate the least squares regression
+			slope. Stop the job if the slope falls below the specified limit.
+
+.. option:: steadystate_duration=time, ss_dur=time
 
-verify_fatal=bool	Normally fio will keep checking the entire contents
-		before quitting on a block verification failure. If this
-		option is set, fio will exit the job on the first observed
-		failure.
-
-verify_dump=bool	If set, dump the contents of both the original data
-		block and the data block we read off disk to files. This
-		allows later analysis to inspect just what kind of data
-		corruption occurred. Off by default.
-
-verify_async=int	Fio will normally verify IO inline from the submitting
-		thread. This option takes an integer describing how many
-		async offload threads to create for IO verification instead,
-		causing fio to offload the duty of verifying IO contents
-		to one or more separate threads. If using this offload
-		option, even sync IO engines can benefit from using an
-		iodepth setting higher than 1, as it allows them to have
-		IO in flight while verifies are running.
-
-verify_async_cpus=str	Tell fio to set the given CPU affinity on the
-		async IO verification threads. See cpus_allowed for the
-		format used.
-
-verify_backlog=int	Fio will normally verify the written contents of a
-		job that utilizes verify once that job has completed. In
-		other words, everything is written then everything is read
-		back and verified. You may want to verify continually
-		instead for a variety of reasons. Fio stores the meta data
-		associated with an IO block in memory, so for large
-		verify workloads, quite a bit of memory would be used up
-		holding this meta data. If this option is enabled, fio
-		will write only N blocks before verifying these blocks.
-
-verify_backlog_batch=int	Control how many blocks fio will verify
-		if verify_backlog is set. If not set, will default to
-		the value of verify_backlog (meaning the entire queue
-		is read back and verified).  If verify_backlog_batch is
-		less than verify_backlog then not all blocks will be verified,
-		if verify_backlog_batch is larger than verify_backlog, some
-		blocks will be verified more than once.
-
-verify_state_save=bool	When a job exits during the write phase of a verify
-		workload, save its current state. This allows fio to replay
-		up until that point, if the verify state is loaded for the
-		verify read phase. The format of the filename is, roughly,
-		<type>-<jobname>-<jobindex>-verify.state. <type> is "local"
-		for a local run, "sock" for a client/server socket connection,
-		and "ip" (192.168.0.1, for instance) for a networked
-		client/server connection.
-
-verify_state_load=bool	If a verify termination trigger was used, fio stores
-		the current write state of each thread. This can be used at
-		verification time so that fio knows how far it should verify.
-		Without this information, fio will run a full verification
-		pass, according to the settings in the job file used.
-
-stonewall
-wait_for_previous Wait for preceding jobs in the job file to exit, before
-		starting this one. Can be used to insert serialization
-		points in the job file. A stone wall also implies starting
-		a new reporting group.
-
-new_group	Start a new reporting group. See: group_reporting.
-
-numjobs=int	Create the specified number of clones of this job. May be
-		used to setup a larger number of threads/processes doing
-		the same thing. Each thread is reported separately; to see
-		statistics for all clones as a whole, use group_reporting in
-		conjunction with new_group.
-
-group_reporting	It may sometimes be interesting to display statistics for
-		groups of jobs as a whole instead of for each individual job.
-		This is especially true if 'numjobs' is used; looking at
-		individual thread/process output quickly becomes unwieldy.
-		To see the final report per-group instead of per-job, use
-		'group_reporting'. Jobs in a file will be part of the same
-		reporting group, unless if separated by a stonewall, or by
-		using 'new_group'.
-
-thread		fio defaults to forking jobs, however if this option is
-		given, fio will use pthread_create(3) to create threads
-		instead.
-
-zonesize=int	Divide a file into zones of the specified size. See zoneskip.
-
-zoneskip=int	Skip the specified number of bytes when zonesize data has
-		been read. The two zone options can be used to only do
-		io on zones of a file.
-
-write_iolog=str	Write the issued io patterns to the specified file. See
-		read_iolog.  Specify a separate file for each job, otherwise
-		the iologs will be interspersed and the file may be corrupt.
-
-read_iolog=str	Open an iolog with the specified file name and replay the
-		io patterns it contains. This can be used to store a
-		workload and replay it sometime later. The iolog given
-		may also be a blktrace binary file, which allows fio
-		to replay a workload captured by blktrace. See blktrace
-		for how to capture such logging data. For blktrace replay,
-		the file needs to be turned into a blkparse binary data
-		file first (blkparse <device> -o /dev/null -d file_for_fio.bin).
-
-replay_no_stall=int When replaying I/O with read_iolog the default behavior
-		is to attempt to respect the time stamps within the log and
-		replay them with the appropriate delay between IOPS. By
-		setting this variable fio will not respect the timestamps and
-		attempt to replay them as fast as possible while still
-		respecting ordering. The result is the same I/O pattern to a
-		given device, but different timings.
-
-replay_redirect=str While replaying I/O patterns using read_iolog the
-		default behavior is to replay the IOPS onto the major/minor
-		device that each IOP was recorded from.  This is sometimes
-		undesirable because on a different machine those major/minor
-		numbers can map to a different device.  Changing hardware on
-		the same system can also result in a different major/minor
-		mapping.  Replay_redirect causes all IOPS to be replayed onto
-		the single specified device regardless of the device it was
-		recorded from. i.e. replay_redirect=/dev/sdc would cause all
-		IO in the blktrace or iolog to be replayed onto /dev/sdc.
-		This means multiple devices will be replayed onto a single
-		device, if the trace contains multiple devices. If you want
-		multiple devices to be replayed concurrently to multiple
-		redirected devices you must blkparse your trace into separate
-		traces and replay them with independent fio invocations.
-		Unfortunately this also breaks the strict time ordering
-		between multiple device accesses.
-
-replay_align=int	Force alignment of IO offsets and lengths in a trace
-		to this power of 2 value.
-
-replay_scale=int	Scale sector offsets down by this factor when
-		replaying traces.
-
-per_job_logs=bool	If set, this generates bw/clat/iops log with per
-		file private filenames. If not set, jobs with identical names
-		will share the log filename. Default: true.
-
-write_bw_log=str If given, write a bandwidth log of the jobs in this job
-		file. Can be used to store data of the bandwidth of the
-		jobs in their lifetime. The included fio_generate_plots
-		script uses gnuplot to turn these text files into nice
-		graphs. See write_lat_log for behaviour of given
-		filename. For this option, the suffix is _bw.x.log, where
-		x is the index of the job (1..N, where N is the number of
-		jobs). If 'per_job_logs' is false, then the filename will not
-		include the job index. See 'Log File Formats'.
-
-write_lat_log=str Same as write_bw_log, except that this option stores io
-		submission, completion, and total latencies instead. If no
-		filename is given with this option, the default filename of
-		"jobname_type.log" is used. Even if the filename is given,
-		fio will still append the type of log. So if one specifies
+	A rolling window of this duration will be used to judge whether steady state
+	has been reached. Data will be collected once per second. The default is 0
+	which disables steady state detection.  When the unit is omitted, the
+	value is interpreted in seconds.
+
+.. option:: steadystate_ramp_time=time, ss_ramp=time
+
+	Allow the job to run for the specified duration before beginning data
+	collection for checking the steady state job termination criterion. The
+	default is 0.  When the unit is omitted, the value is interpreted in seconds.
+
+
+Measurements and reporting
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: per_job_logs=bool
+
+	If set, this generates bw/clat/iops logs with per-job private filenames. If
+	not set, jobs with identical names will share the log filename. Default:
+	true.
+
+.. option:: group_reporting
+
+	It may sometimes be interesting to display statistics for groups of jobs as
+	a whole instead of for each individual job.  This is especially true if
+	:option:`numjobs` is used; looking at individual thread/process output
+	quickly becomes unwieldy.  To see the final report per-group instead of
+	per-job, use :option:`group_reporting`. Jobs in a file will be part of the
+	same reporting group, unless separated by a :option:`stonewall`, or by
+	using :option:`new_group`.
+
+.. option:: new_group
+
+	Start a new reporting group. See: :option:`group_reporting`.  If not given,
+	all jobs in a file will be part of the same reporting group, unless
+	separated by a :option:`stonewall`.
+
+.. option:: stats=bool
+
+	By default, fio collects and shows final output results for all jobs
+	that run. If this option is set to 0, then fio will ignore this job in
+	the final stat output.
+
+.. option:: write_bw_log=str
+
+	If given, write a bandwidth log for this job. Can be used to store data of
+	the bandwidth of the jobs in their lifetime. The included
+	:command:`fio_generate_plots` script uses :command:`gnuplot` to turn these
+	text files into nice graphs. See :option:`write_lat_log` for behavior of
+	given filename. For this option, the postfix is :file:`_bw.x.log`, where `x`
+	is the index of the job (`1..N`, where `N` is the number of jobs). If
+	:option:`per_job_logs` is false, then the filename will not include the job
+	index.  See `Log File Formats`_.
+
+.. option:: write_lat_log=str
+
+	Same as :option:`write_bw_log`, except that this option stores I/O
+	submission, completion, and total latencies instead. If no filename is given
+	with this option, the default filename of :file:`jobname_type.log` is
+	used. Even if the filename is given, fio will still append the type of
+	log. So if one specifies::
 
 		write_lat_log=foo
 
-		The actual log names will be foo_slat.x.log, foo_clat.x.log,
-		and foo_lat.x.log, where x is the index of the job (1..N,
-		where N is the number of jobs). This helps fio_generate_plot
-		find the logs automatically. If 'per_job_logs' is false, then
-		the filename will not include the job index. See 'Log File
-		Formats'.
-
-write_hist_log=str Same as write_lat_log, but writes I/O completion
-		latency histograms. If no filename is given with this option, the
-		default filename of "jobname_clat_hist.x.log" is used, where x is
-		the index of the job (1..N, where N is the number of jobs). Even
-		if the filename is given, fio will still append the type of log.
-		If per_job_logs is false, then the filename will not include the
-		job index. See 'Log File Formats'.
-
-write_iops_log=str Same as write_bw_log, but writes IOPS. If no filename is
-		given with this option, the default filename of
-		"jobname_type.x.log" is used,where x is the index of the job
-		(1..N, where N is the number of jobs). Even if the filename
-		is given, fio will still append the type of log. If
-		'per_job_logs' is false, then the filename will not include
-		the job index. See 'Log File Formats'.
-
-log_avg_msec=int By default, fio will log an entry in the iops, latency,
-		or bw log for every IO that completes. When writing to the
-		disk log, that can quickly grow to a very large size. Setting
-		this option makes fio average the each log entry over the
-		specified period of time, reducing the resolution of the log.
-		See log_max_value as well. Defaults to 0, logging all entries.
-
-log_hist_msec=int Same as log_avg_msec, but logs entries for completion
-		latency histograms. Computing latency percentiles from averages of
-		intervals using log_avg_msec is innacurate. Setting this option makes
-		fio log histogram entries over the specified period of time, reducing
-		log sizes for high IOPS devices while retaining percentile accuracy.
-		See log_hist_coarseness as well. Defaults to 0, meaning histogram
-		logging is disabled.
-
-log_hist_coarseness=int Integer ranging from 0 to 6, defining the coarseness
-		of the resolution of the histogram logs enabled with log_hist_msec. For
-		each increment in coarseness, fio outputs half as many bins. Defaults to
-		0, for which histogram logs contain 1216 latency bins. See
-		'Log File Formats'.
-
-log_max_value=bool	If log_avg_msec is set, fio logs the average over that
-		window. If you instead want to log the maximum value, set this
-		option to 1. Defaults to 0, meaning that averaged values are
-		logged.
-
-log_offset=int	If this is set, the iolog options will include the byte
-		offset for the IO entry as well as the other data values.
-
-log_compression=int	If this is set, fio will compress the IO logs as
-		it goes, to keep the memory footprint lower. When a log
-		reaches the specified size, that chunk is removed and
-		compressed in the background. Given that IO logs are
-		fairly highly compressible, this yields a nice memory
-		savings for longer runs. The downside is that the
-		compression will consume some background CPU cycles, so
-		it may impact the run. This, however, is also true if
-		the logging ends up consuming most of the system memory.
-		So pick your poison. The IO logs are saved normally at the
-		end of a run, by decompressing the chunks and storing them
-		in the specified log file. This feature depends on the
-		availability of zlib.
-
-log_compression_cpus=str	Define the set of CPUs that are allowed to
-		handle online log compression for the IO jobs. This can
-		provide better isolation between performance sensitive jobs,
-		and background compression work.
-
-log_store_compressed=bool	If set, fio will store the log files in a
-		compressed format. They can be decompressed with fio, using
-		the --inflate-log command line parameter. The files will be
-		stored with a .fz suffix.
-
-log_unix_epoch=bool	If set, fio will log Unix timestamps to the log
-		files produced by enabling write_type_log for each log type, instead
-		of the default zero-based timestamps.
-
-block_error_percentiles=bool	If set, record errors in trim block-sized
-		units from writes and trims and output a histogram of
-		how many trims it took to get to errors, and what kind
-		of error was encountered.
-
-lockmem=int	Pin down the specified amount of memory with mlock(2). Can
-		potentially be used instead of removing memory or booting
-		with less memory to simulate a smaller amount of memory.
-		The amount specified is per worker.
-
-exec_prerun=str	Before running this job, issue the command specified
-		through system(3). Output is redirected in a file called
-		jobname.prerun.txt.
-
-exec_postrun=str After the job completes, issue the command specified
-		 though system(3). Output is redirected in a file called
-		 jobname.postrun.txt.
-
-ioscheduler=str	Attempt to switch the device hosting the file to the specified
-		io scheduler before running.
-
-disk_util=bool	Generate disk utilization statistics, if the platform
-		supports it. Defaults to on.
-
-disable_lat=bool Disable measurements of total latency numbers. Useful
-		only for cutting back the number of calls to gettimeofday,
-		as that does impact performance at really high IOPS rates.
-		Note that to really get rid of a large amount of these
-		calls, this option must be used with disable_slat and
-		disable_bw as well.
-
-disable_clat=bool Disable measurements of completion latency numbers. See
-		disable_lat.
-
-disable_slat=bool Disable measurements of submission latency numbers. See
-		disable_slat.
-
-disable_bw=bool	Disable measurements of throughput/bandwidth numbers. See
-		disable_lat.
-
-clat_percentiles=bool Enable the reporting of percentiles of
-		 completion latencies.
-
-percentile_list=float_list Overwrite the default list of percentiles
-		for completion latencies and the block error histogram.
-		Each number is a floating number in the range (0,100],
-		and the maximum length of the list is 20. Use ':'
-		to separate the numbers, and list the numbers in ascending
-		order. For example, --percentile_list=99.5:99.9 will cause
-		fio to report the values of completion latency below which
-		99.5% and 99.9% of the observed latencies fell, respectively.
-
-clocksource=str	Use the given clocksource as the base of timing. The
-		supported options are:
-
-			gettimeofday	gettimeofday(2)
-
-			clock_gettime	clock_gettime(2)
-
-			cpu		Internal CPU clock source
-
-		cpu is the preferred clocksource if it is reliable, as it
-		is very fast (and fio is heavy on time calls). Fio will
-		automatically use this clocksource if it's supported and
-		considered reliable on the system it is running on, unless
-		another clocksource is specifically set. For x86/x86-64 CPUs,
-		this means supporting TSC Invariant.
-
-gtod_reduce=bool Enable all of the gettimeofday() reducing options
-		(disable_clat, disable_slat, disable_bw) plus reduce
-		precision of the timeout somewhat to really shrink
-		the gettimeofday() call count. With this option enabled,
-		we only do about 0.4% of the gtod() calls we would have
-		done if all time keeping was enabled.
-
-gtod_cpu=int	Sometimes it's cheaper to dedicate a single thread of
-		execution to just getting the current time. Fio (and
-		databases, for instance) are very intensive on gettimeofday()
-		calls. With this option, you can set one CPU aside for
-		doing nothing but logging current time to a shared memory
-		location. Then the other threads/processes that run IO
-		workloads need only copy that segment, instead of entering
-		the kernel with a gettimeofday() call. The CPU set aside
-		for doing these time calls will be excluded from other
-		uses. Fio will manually clear it from the CPU mask of other
-		jobs.
-
-continue_on_error=str	Normally fio will exit the job on the first observed
-		failure. If this option is set, fio will continue the job when
-		there is a 'non-fatal error' (EIO or EILSEQ) until the runtime
-		is exceeded or the I/O size specified is completed. If this
-		option is used, there are two more stats that are appended,
-		the total error count and the first error. The error field
-		given in the stats is the first error that was hit during the
-		run.
-
-		The allowed values are:
-
-			none	Exit on any IO or verify errors.
-
-			read	Continue on read errors, exit on all others.
-
-			write	Continue on write errors, exit on all others.
-
-			io	Continue on any IO error, exit on all others.
-
-			verify	Continue on verify errors, exit on all others.
-
-			all	Continue on all errors.
-
-			0		Backward-compatible alias for 'none'.
-
-			1		Backward-compatible alias for 'all'.
-
-ignore_error=str Sometimes you want to ignore some errors during test
-		 in that case you can specify error list for each error type.
-		 ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST
-		 errors for given error type is separated with ':'. Error
-		 may be symbol ('ENOSPC', 'ENOMEM') or integer.
-		 Example:
-			ignore_error=EAGAIN,ENOSPC:122
-		 This option will ignore EAGAIN from READ, and ENOSPC and
-		 122(EDQUOT) from WRITE.
-
-error_dump=bool If set dump every error even if it is non fatal, true
-		by default. If disabled only fatal error will be dumped
-
-cgroup=str	Add job to this control group. If it doesn't exist, it will
-		be created. The system must have a mounted cgroup blkio
-		mount point for this to work. If your system doesn't have it
-		mounted, you can do so with:
+	The actual log names will be :file:`foo_slat.x.log`, :file:`foo_clat.x.log`,
+	and :file:`foo_lat.x.log`, where `x` is the index of the job (`1..N`, where `N`
+	is the number of jobs). This helps :command:`fio_generate_plots` find the
+	logs automatically. If :option:`per_job_logs` is false, then the filename
+	will not include the job index.  See `Log File Formats`_.
 
-		# mount -t cgroup -o blkio none /cgroup
+.. option:: write_hist_log=str
 
-cgroup_weight=int	Set the weight of the cgroup to this value. See
-		the documentation that comes with the kernel, allowed values
-		are in the range of 100..1000.
-
-cgroup_nodelete=bool Normally fio will delete the cgroups it has created after
-		the job completion. To override this behavior and to leave
-		cgroups around after the job completion, set cgroup_nodelete=1.
-		This can be useful if one wants to inspect various cgroup
-		files after job completion. Default: false
-
-uid=int		Instead of running as the invoking user, set the user ID to
-		this value before the thread/process does any work.
-
-gid=int		Set group ID, see uid.
-
-flow_id=int	The ID of the flow. If not specified, it defaults to being a
-		global flow. See flow.
-
-flow=int	Weight in token-based flow control. If this value is used, then
-		there is a 'flow counter' which is used to regulate the
-		proportion of activity between two or more jobs. fio attempts
-		to keep this flow counter near zero. The 'flow' parameter
-		stands for how much should be added or subtracted to the flow
-		counter on each iteration of the main I/O loop. That is, if
-		one job has flow=8 and another job has flow=-1, then there
-		will be a roughly 1:8 ratio in how much one runs vs the other.
-
-flow_watermark=int	The maximum value that the absolute value of the flow
-		counter is allowed to reach before the job must wait for a
-		lower value of the counter.
+	Same as :option:`write_lat_log`, but writes I/O completion latency
+	histograms. If no filename is given with this option, the default filename
+	of :file:`jobname_clat_hist.x.log` is used, where `x` is the index of the
+	job (`1..N`, where `N` is the number of jobs). Even if the filename is given,
+	fio will still append the type of log.  If :option:`per_job_logs` is false,
+	then the filename will not include the job index. See `Log File Formats`_.
 
-flow_sleep=int	The period of time, in microseconds, to wait after the flow
-		watermark has been exceeded before retrying operations
+.. option:: write_iops_log=str
 
-In addition, there are some parameters which are only valid when a specific
-ioengine is in use. These are used identically to normal parameters, with the
-caveat that when used on the command line, they must come after the ioengine
-that defines them is selected.
-
-[libaio] userspace_reap Normally, with the libaio engine in use, fio will use
-		the io_getevents system call to reap newly returned events.
-		With this flag turned on, the AIO ring will be read directly
-		from user-space to reap events. The reaping mode is only
-		enabled when polling for a minimum of 0 events (eg when
-		iodepth_batch_complete=0).
-
-[psyncv2] hipri		Set RWF_HIPRI on IO, indicating to the kernel that
-			it's of higher priority than normal.
-
-[cpuio] cpuload=int Attempt to use the specified percentage of CPU cycles.
-
-[cpuio] cpuchunks=int Split the load into cycles of the given time. In
-		microseconds.
-
-[cpuio] exit_on_io_done=bool Detect when IO threads are done, then exit.
-
-[netsplice] hostname=str
-[net] hostname=str The host name or IP address to use for TCP or UDP based IO.
-		If the job is a TCP listener or UDP reader, the hostname is not
-		used and must be omitted unless it is a valid UDP multicast
-		address.
-[libhdfs] namenode=str The host name or IP address of a HDFS cluster namenode to contact.
-
-[netsplice] port=int
-[net] port=int	The TCP or UDP port to bind to or connect to. If this is used
-with numjobs to spawn multiple instances of the same job type, then this will
-be the starting port number since fio will use a range of ports.
-[libhdfs] port=int	the listening port of the HFDS cluster namenode.
-
-[netsplice] interface=str
-[net] interface=str  The IP address of the network interface used to send or
-		receive UDP multicast
-
-[netsplice] ttl=int
-[net] ttl=int	Time-to-live value for outgoing UDP multicast packets.
-		Default: 1
-
-[netsplice] nodelay=bool
-[net] nodelay=bool	Set TCP_NODELAY on TCP connections.
-
-[netsplice] protocol=str
-[netsplice] proto=str
-[net] protocol=str
-[net] proto=str	The network protocol to use. Accepted values are:
-
-			tcp	Transmission control protocol
-			tcpv6	Transmission control protocol V6
-			udp	User datagram protocol
-			udpv6	User datagram protocol V6
-			unix	UNIX domain socket
-
-		When the protocol is TCP or UDP, the port must also be given,
-		as well as the hostname if the job is a TCP listener or UDP
-		reader. For unix sockets, the normal filename option should be
-		used and the port is invalid.
-
-[net] listen	For TCP network connections, tell fio to listen for incoming
-		connections rather than initiating an outgoing connection. The
-		hostname must be omitted if this option is used.
-
-[net] pingpong	Normally a network writer will just continue writing data, and
-		a network reader will just consume packages. If pingpong=1
-		is set, a writer will send its normal payload to the reader,
-		then wait for the reader to send the same payload back. This
-		allows fio to measure network latencies. The submission
-		and completion latencies then measure local time spent
-		sending or receiving, and the completion latency measures
-		how long it took for the other end to receive and send back.
-		For UDP multicast traffic pingpong=1 should only be set for a
-		single reader when multiple readers are listening to the same
-		address.
-
-[net] window_size	Set the desired socket buffer size for the connection.
-
-[net] mss	Set the TCP maximum segment size (TCP_MAXSEG).
-
-[e4defrag] donorname=str
-	        File will be used as a block donor(swap extents between files)
-[e4defrag] inplace=int
-		Configure donor file blocks allocation strategy
-		0(default): Preallocate donor's file on init
-		1 	  : allocate space immediately inside defragment event,
-			    and free right after event
-
-[rbd] clustername=str	Specifies the name of the Ceph cluster.
-[rbd] rbdname=str	Specifies the name of the RBD.
-[rbd] pool=str		Specifies the name of the Ceph pool containing RBD.
-[rbd] clientname=str	Specifies the username (without the 'client.' prefix)
-			used to access the Ceph cluster. If the clustername is
-			specified, the clientname shall be the full type.id
-			string. If no type. prefix is given, fio will add
-			'client.' by default.
-
-[mtd] skip_bad=bool	Skip operations against known bad blocks.
-
-[libhdfs] hdfsdirectory	libhdfs will create chunk in this HDFS directory
-[libhdfs] chunk_size	the size of the chunk to use for each file.
-
-
-6.0 Interpreting the output
----------------------------
-
-fio spits out a lot of output. While running, fio will display the
-status of the jobs created. An example of that would be:
-
-Threads: 1: [_r] [24.8% done] [ 13509/  8334 kb/s] [eta 00h:01m:31s]
-
-The characters inside the square brackets denote the current status of
-each thread. The possible values (in typical life cycle order) are:
-
-Idle	Run
-----    ---
-P		Thread setup, but not started.
-C		Thread created.
-I		Thread initialized, waiting or generating necessary data.
-	p	Thread running pre-reading file(s).
-	R	Running, doing sequential reads.
-	r	Running, doing random reads.
-	W	Running, doing sequential writes.
-	w	Running, doing random writes.
-	M	Running, doing mixed sequential reads/writes.
-	m	Running, doing mixed random reads/writes.
-	F	Running, currently waiting for fsync()
-	f	Running, finishing up (writing IO logs, etc)
-	V	Running, doing verification of written data.
-E		Thread exited, not reaped by main thread yet.
-_		Thread reaped, or
-X		Thread reaped, exited with an error.
-K		Thread reaped, exited due to signal.
-
-Fio will condense the thread string as not to take up more space on the
-command line as is needed. For instance, if you have 10 readers and 10
-writers running, the output would look like this:
-
-Jobs: 20 (f=20): [R(10),W(10)] [4.0% done] [2103MB/0KB/0KB /s] [538K/0/0 iops] [eta 57m:36s]
-
-Fio will still maintain the ordering, though. So the above means that jobs
-1..10 are readers, and 11..20 are writers.
-
-The other values are fairly self explanatory - number of threads
-currently running and doing io, rate of io since last check (read speed
-listed first, then write speed), and the estimated completion percentage
-and time for the running group. It's impossible to estimate runtime of
-the following groups (if any). Note that the string is displayed in order,
-so it's possible to tell which of the jobs are currently doing what. The
-first character is the first job defined in the job file, and so forth.
-
-When fio is done (or interrupted by ctrl-c), it will show the data for
-each thread, group of threads, and disks in that order. For each data
-direction, the output looks like:
-
-Client1 (g=0): err= 0:
-  write: io=    32MB, bw=   666KB/s, iops=89 , runt= 50320msec
-    slat (msec): min=    0, max=  136, avg= 0.03, stdev= 1.92
-    clat (msec): min=    0, max=  631, avg=48.50, stdev=86.82
-    bw (KB/s) : min=    0, max= 1196, per=51.00%, avg=664.02, stdev=681.68
-  cpu        : usr=1.49%, sys=0.25%, ctx=7969, majf=0, minf=17
-  IO depths    : 1=0.1%, 2=0.3%, 4=0.5%, 8=99.0%, 16=0.0%, 32=0.0%, >32=0.0%
-     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
-     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
-     issued r/w: total=0/32768, short=0/0
-     lat (msec): 2=1.6%, 4=0.0%, 10=3.2%, 20=12.8%, 50=38.4%, 100=24.8%,
-     lat (msec): 250=15.2%, 500=0.0%, 750=0.0%, 1000=0.0%, >=2048=0.0%
-
-The client number is printed, along with the group id and error of that
-thread. Below is the io statistics, here for writes. In the order listed,
-they denote:
-
-io=		Number of megabytes io performed
-bw=		Average bandwidth rate
-iops=           Average IOs performed per second
-runt=		The runtime of that thread
-	slat=	Submission latency (avg being the average, stdev being the
-		standard deviation). This is the time it took to submit
-		the io. For sync io, the slat is really the completion
-		latency, since queue/complete is one operation there. This
-		value can be in milliseconds or microseconds, fio will choose
-		the most appropriate base and print that. In the example
-		above, milliseconds is the best scale. Note: in --minimal mode
+	Same as :option:`write_bw_log`, but writes IOPS. If no filename is given
+	with this option, the default filename of :file:`jobname_type.x.log` is
+	used, where `x` is the index of the job (`1..N`, where `N` is the number of
+	jobs). Even if the filename is given, fio will still append the type of
+	log. If :option:`per_job_logs` is false, then the filename will not include
+	the job index. See `Log File Formats`_.
+
+.. option:: log_avg_msec=int
+
+	By default, fio will log an entry in the iops, latency, or bw log for every
+	I/O that completes. When writing to the disk log, that can quickly grow to a
+	very large size. Setting this option makes fio average each log entry
+	over the specified period of time, reducing the resolution of the log.  See
+	:option:`log_max_value` as well. Defaults to 0, logging all entries.
+	Also see `Log File Formats`_.
+
+.. option:: log_hist_msec=int
+
+	Same as :option:`log_avg_msec`, but logs entries for completion latency
+	histograms. Computing latency percentiles from averages of intervals using
+	:option:`log_avg_msec` is inaccurate. Setting this option makes fio log
+	histogram entries over the specified period of time, reducing log sizes for
+	high IOPS devices while retaining percentile accuracy.  See
+	:option:`log_hist_coarseness` as well. Defaults to 0, meaning histogram
+	logging is disabled.
+
+.. option:: log_hist_coarseness=int
+
+	Integer ranging from 0 to 6, defining the coarseness of the resolution of
+	the histogram logs enabled with :option:`log_hist_msec`. For each increment
+	in coarseness, fio outputs half as many bins. Defaults to 0, for which
+	histogram logs contain 1216 latency bins. See `Log File Formats`_.
+
+.. option:: log_max_value=bool
+
+	If :option:`log_avg_msec` is set, fio logs the average over that window. If
+	you instead want to log the maximum value, set this option to 1. Defaults to
+	0, meaning that averaged values are logged.
+
+.. option:: log_offset=bool
+
+	If this is set, the iolog options will include the byte offset for the I/O
+	entry as well as the other data values. Defaults to 0 meaning that
+	offsets are not present in logs. Also see `Log File Formats`_.
+
+.. option:: log_compression=int
+
+	If this is set, fio will compress the I/O logs as it goes, to keep the
+	memory footprint lower. When a log reaches the specified size, that chunk is
+	removed and compressed in the background. Given that I/O logs are fairly
+	highly compressible, this yields a nice memory savings for longer runs. The
+	downside is that the compression will consume some background CPU cycles, so
+	it may impact the run. This, however, is also true if the logging ends up
+	consuming most of the system memory.  So pick your poison. The I/O logs are
+	saved normally at the end of a run, by decompressing the chunks and storing
+	them in the specified log file. This feature depends on the availability of
+	zlib.
+
+.. option:: log_compression_cpus=str
+
+	Define the set of CPUs that are allowed to handle online log compression for
+	the I/O jobs. This can provide better isolation between performance
+	sensitive jobs, and background compression work.
+
+.. option:: log_store_compressed=bool
+
+	If set, fio will store the log files in a compressed format. They can be
+	decompressed with fio, using the :option:`--inflate-log` command line
+	parameter. The files will be stored with a :file:`.fz` suffix.
+
+.. option:: log_unix_epoch=bool
+
+	If set, fio will log Unix timestamps to the log files produced by enabling
+	write_type_log for each log type, instead of the default zero-based
+	timestamps.
+
+.. option:: block_error_percentiles=bool
+
+	If set, record errors in trim block-sized units from writes and trims and
+	output a histogram of how many trims it took to get to errors, and what kind
+	of error was encountered.
+
+.. option:: bwavgtime=int
+
+	Average the calculated bandwidth over the given time. Value is specified in
+	milliseconds. If the job also does bandwidth logging through
+	:option:`write_bw_log`, then the minimum of this option and
+	:option:`log_avg_msec` will be used.  Default: 500ms.
+
+.. option:: iopsavgtime=int
+
+	Average the calculated IOPS over the given time. Value is specified in
+	milliseconds. If the job also does IOPS logging through
+	:option:`write_iops_log`, then the minimum of this option and
+	:option:`log_avg_msec` will be used.  Default: 500ms.
+
+.. option:: disk_util=bool
+
+	Generate disk utilization statistics, if the platform supports it.
+	Default: true.
+
+.. option:: disable_lat=bool
+
+	Disable measurements of total latency numbers. Useful only for cutting back
+	the number of calls to :manpage:`gettimeofday(2)`, as that does impact
+	performance at really high IOPS rates.  Note that to really get rid of a
+	large amount of these calls, this option must be used with
+	:option:`disable_slat` and :option:`disable_bw_measurement` as well.
+
+.. option:: disable_clat=bool
+
+	Disable measurements of completion latency numbers. See
+	:option:`disable_lat`.
+
+.. option:: disable_slat=bool
+
+	Disable measurements of submission latency numbers. See
+	:option:`disable_lat`.
+
+.. option:: disable_bw_measurement=bool, disable_bw=bool
+
+	Disable measurements of throughput/bandwidth numbers. See
+	:option:`disable_lat`.
+
+.. option:: clat_percentiles=bool
+
+	Enable the reporting of percentiles of completion latencies.  This
+	option is mutually exclusive with :option:`lat_percentiles`.
+
+.. option:: lat_percentiles=bool
+
+	Enable the reporting of percentiles of I/O latencies. This is similar
+	to :option:`clat_percentiles`, except that this includes the
+	submission latency. This option is mutually exclusive with
+	:option:`clat_percentiles`.
+
+.. option:: percentile_list=float_list
+
+	Overwrite the default list of percentiles for completion latencies and the
+	block error histogram.  Each number is a floating point number in the range
+	(0,100], and the maximum length of the list is 20. Use ``:`` to separate the
+	numbers, and list the numbers in ascending order. For example,
+	``--percentile_list=99.5:99.9`` will cause fio to report the values of
+	completion latency below which 99.5% and 99.9% of the observed latencies
+	fell, respectively.
+
+
+Error handling
+~~~~~~~~~~~~~~
+
+.. option:: exitall_on_error
+
+	When one job finishes in error, terminate the rest. The default is to wait
+	for each job to finish.
+
+.. option:: continue_on_error=str
+
+	Normally fio will exit the job on the first observed failure. If this option
+	is set, fio will continue the job when there is a 'non-fatal error' (EIO or
+	EILSEQ) until the runtime is exceeded or the I/O size specified is
+	completed. If this option is used, there are two more stats that are
+	appended, the total error count and the first error. The error field given
+	in the stats is the first error that was hit during the run.
+
+	The allowed values are:
+
+		**none**
+			Exit on any I/O or verify errors.
+
+		**read**
+			Continue on read errors, exit on all others.
+
+		**write**
+			Continue on write errors, exit on all others.
+
+		**io**
+			Continue on any I/O error, exit on all others.
+
+		**verify**
+			Continue on verify errors, exit on all others.
+
+		**all**
+			Continue on all errors.
+
+		**0**
+			Backward-compatible alias for 'none'.
+
+		**1**
+			Backward-compatible alias for 'all'.
+
+.. option:: ignore_error=str
+
+	Sometimes you want to ignore some errors during a test; in that case you
+	can specify an error list for each error type, instead of only being able
+	to ignore the default 'non-fatal error' using :option:`continue_on_error`.
+	The format is ``ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST``;
+	errors for a given error type are separated with ':'. An error may be a
+	symbol ('ENOSPC', 'ENOMEM') or an integer.  Example::
+
+		ignore_error=EAGAIN,ENOSPC:122
+
+	This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from
+	WRITE. This option works by overriding :option:`continue_on_error` with
+	the list of errors for each error type if any.
+
+.. option:: error_dump=bool
+
+	If set, dump every error even if it is non-fatal; true by default. If
+	disabled, only fatal errors will be dumped.
+
+Running predefined workloads
+----------------------------
+
+Fio includes predefined profiles that mimic the I/O workloads generated by
+other tools.
+
+.. option:: profile=str
+
+	The predefined workload to run.  Current profiles are:
+
+		**tiobench**
+			Threaded I/O bench (tiotest/tiobench) like workload.
+
+		**act**
+			Aerospike Certification Tool (ACT) like workload.
+
+To view a profile's additional options use :option:`--cmdhelp` after specifying
+the profile.  For example::
+
+	$ fio --profile=act --cmdhelp
+
+Act profile options
+~~~~~~~~~~~~~~~~~~~
+
+.. option:: device-names=str
+	:noindex:
+
+	Devices to use.
+
+.. option:: load=int
+	:noindex:
+
+	ACT load multiplier.  Default: 1.
+
+.. option:: test-duration=time
+	:noindex:
+
+	How long the entire test takes to run.  When the unit is omitted, the value
+	is given in seconds.  Default: 24h.
+
+.. option:: threads-per-queue=int
+	:noindex:
+
+	Number of read I/O threads per device.  Default: 8.
+
+.. option:: read-req-num-512-blocks=int
+	:noindex:
+
+	Number of 512B blocks to read at a time.  Default: 3.
+
+.. option:: large-block-op-kbytes=int
+	:noindex:
+
+	Size of large block ops in KiB (writes).  Default: 131072.
+
+.. option:: prep
+	:noindex:
+
+	Set to run ACT prep phase.
+
+Tiobench profile options
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: size=str
+	:noindex:
+
+	Size in MiB.
+
+.. option:: block=int
+	:noindex:
+
+	Block size in bytes.  Default: 4096.
+
+.. option:: numruns=int
+	:noindex:
+
+	Number of runs.
+
+.. option:: dir=str
+	:noindex:
+
+	Test directory.
+
+.. option:: threads=int
+	:noindex:
+
+	Number of threads.
+
+Interpreting the output
+-----------------------
+
+..
+	Example output was based on the following:
+	TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --time_based \
+		--rate=1256k --bs=14K --name=quick --runtime=1s --name=mixed \
+		--runtime=2m --rw=rw
+
+Fio spits out a lot of output. While running, fio will display the status of the
+jobs created. An example of that would be::
+
+    Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s]
+
+The characters inside the first set of square brackets denote the current status of
+each thread.  The first character is the first job defined in the job file, and so
+forth.  The possible values (in typical life cycle order) are:
+
++------+-----+-----------------------------------------------------------+
+| Idle | Run |                                                           |
++======+=====+===========================================================+
+| P    |     | Thread setup, but not started.                            |
++------+-----+-----------------------------------------------------------+
+| C    |     | Thread created.                                           |
++------+-----+-----------------------------------------------------------+
+| I    |     | Thread initialized, waiting or generating necessary data. |
++------+-----+-----------------------------------------------------------+
+|      |  p  | Thread running pre-reading file(s).                       |
++------+-----+-----------------------------------------------------------+
+|      |  /  | Thread is in ramp period.                                 |
++------+-----+-----------------------------------------------------------+
+|      |  R  | Running, doing sequential reads.                          |
++------+-----+-----------------------------------------------------------+
+|      |  r  | Running, doing random reads.                              |
++------+-----+-----------------------------------------------------------+
+|      |  W  | Running, doing sequential writes.                         |
++------+-----+-----------------------------------------------------------+
+|      |  w  | Running, doing random writes.                             |
++------+-----+-----------------------------------------------------------+
+|      |  M  | Running, doing mixed sequential reads/writes.             |
++------+-----+-----------------------------------------------------------+
+|      |  m  | Running, doing mixed random reads/writes.                 |
++------+-----+-----------------------------------------------------------+
+|      |  D  | Running, doing sequential trims.                          |
++------+-----+-----------------------------------------------------------+
+|      |  d  | Running, doing random trims.                              |
++------+-----+-----------------------------------------------------------+
+|      |  F  | Running, currently waiting for :manpage:`fsync(2)`.       |
++------+-----+-----------------------------------------------------------+
+|      |  V  | Running, doing verification of written data.              |
++------+-----+-----------------------------------------------------------+
+| f    |     | Thread finishing.                                         |
++------+-----+-----------------------------------------------------------+
+| E    |     | Thread exited, not reaped by main thread yet.             |
++------+-----+-----------------------------------------------------------+
+| _    |     | Thread reaped.                                            |
++------+-----+-----------------------------------------------------------+
+| X    |     | Thread reaped, exited with an error.                      |
++------+-----+-----------------------------------------------------------+
+| K    |     | Thread reaped, exited due to signal.                      |
++------+-----+-----------------------------------------------------------+
+
+..
+	Example output was based on the following:
+	TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --runtime=58m \
+		--time_based --rate=2512k --bs=256K --numjobs=10 \
+		--name=readers --rw=read --name=writers --rw=write
+
+Fio will condense the thread string so as not to take up more space on the command
+line than needed. For instance, if you have 10 readers and 10 writers running,
+the output would look like this::
+
+    Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s]
+
+Note that the status string is displayed in order, so it's possible to tell which of
+the jobs are currently doing what.  In the example above this means that jobs 1--10
+are readers and 11--20 are writers.
+
+The other values are fairly self explanatory -- number of threads currently
+running and doing I/O, the number of currently open files (f=), the estimated
+completion percentage, the rate of I/O since last check (read speed listed first,
+then write speed and optionally trim speed) in terms of bandwidth and IOPS,
+and time to completion for the current running group. It's impossible to estimate
+runtime of the following groups (if any).
+
+..
+	Example output was based on the following:
+	TZ=UTC fio --iodepth=16 --ioengine=posixaio --filename=/tmp/fiofile \
+		--direct=1 --size=100M --time_based --runtime=50s --rate_iops=89 \
+		--bs=7K --name=Client1 --rw=write
+
+When fio is done (or interrupted by :kbd:`Ctrl-C`), it will show the data for
+each thread, group of threads, and disks in that order. For each overall thread (or
+group) the output looks like::
+
+	Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017
+	  write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec)
+	    slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50
+	    clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31
+	     lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79
+	    clat percentiles (usec):
+	     |  1.00th=[  302],  5.00th=[  326], 10.00th=[  343], 20.00th=[  363],
+	     | 30.00th=[  392], 40.00th=[  404], 50.00th=[  416], 60.00th=[  445],
+	     | 70.00th=[  816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627],
+	     | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877],
+	     | 99.99th=[78119]
+	   bw (  KiB/s): min=  532, max=  686, per=0.10%, avg=622.87, stdev=24.82, samples=  100
+	   iops        : min=   76, max=   98, avg=88.98, stdev= 3.54, samples=  100
+	  lat (usec)   : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79%
+	  lat (msec)   : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37%
+	  lat (msec)   : 100=0.65%
+	  cpu          : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21
+	  IO depths    : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0%
+	     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+	     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+	     issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0
+	     latency   : target=0, window=0, percentile=100.00%, depth=8
+
+The job name (or first job's name when using :option:`group_reporting`) is printed,
+along with the group id, count of jobs being aggregated, last error id seen (which
+is 0 when there are no errors), pid/tid of that thread and the time the job/group
+completed.  Below are the I/O statistics for each data direction performed (showing
+writes in the example above).  In the order listed, they denote:
+
+**read/write/trim**
+		The string before the colon shows the I/O direction the statistics
+		are for.  **IOPS** is the average I/Os performed per second.  **BW**
+		is the average bandwidth rate shown as: value in power of 2 format
+		(value in power of 10 format).  The last two values show: (**total
+		I/O performed** in power of 2 format / **runtime** of that thread).
+
+**slat**
+		Submission latency (**min** being the minimum, **max** being the
+		maximum, **avg** being the average, **stdev** being the standard
+		deviation).  This is the time it took to submit the I/O.  For
+		sync I/O this row is not displayed as the slat is really the
+		completion latency (since queue/complete is one operation there).
+		This value can be in nanoseconds, microseconds or milliseconds ---
+		fio will choose the most appropriate base and print that (in the
+		example above nanoseconds was the best scale).  Note: in :option:`--minimal` mode
 		latencies are always expressed in microseconds.
-	clat=	Completion latency. Same names as slat, this denotes the
-		time from submission to completion of the io pieces. For
-		sync io, clat will usually be equal (or very close) to 0,
-		as the time from submit to complete is basically just
-		CPU time (io has already been done, see slat explanation).
-	bw=	Bandwidth. Same names as the xlat stats, but also includes
-		an approximate percentage of total aggregate bandwidth
-		this thread received in this group. This last value is
-		only really useful if the threads in this group are on the
-		same disk, since they are then competing for disk access.
-cpu=		CPU usage. User and system time, along with the number
-		of context switches this thread went through, usage of
-		system and user time, and finally the number of major
-		and minor page faults. The CPU utilization numbers are
-		averages for the jobs in that reporting group, while the
-		context and fault counters are summed.
-IO depths=	The distribution of io depths over the job life time. The
-		numbers are divided into powers of 2, so for example the
-		16= entries includes depths up to that value but higher
-		than the previous entry. In other words, it covers the
-		range from 16 to 31.
-IO submit=	How many pieces of IO were submitting in a single submit
-		call. Each entry denotes that amount and below, until
-		the previous entry - eg, 8=100% mean that we submitted
-		anywhere in between 5-8 ios per submit call.
-IO complete=	Like the above submit number, but for completions instead.
-IO issued=	The number of read/write requests issued, and how many
-		of them were short.
-IO latencies=	The distribution of IO completion latencies. This is the
-		time from when IO leaves fio and when it gets completed.
-		The numbers follow the same pattern as the IO depths,
-		meaning that 2=1.6% means that 1.6% of the IO completed
-		within 2 msecs, 20=12.8% means that 12.8% of the IO
-		took more than 10 msecs, but less than (or equal to) 20 msecs.
 
-After each client has been listed, the group statistics are printed. They
-will look like this:
+**clat**
+		Completion latency. Same names as slat, this denotes the time from
+		submission to completion of the I/O pieces. For sync I/O, clat will
+		usually be equal (or very close) to 0, as the time from submit to
+		complete is basically just CPU time (I/O has already been done, see slat
+		explanation).
+
+**lat**
+		Total latency. Same names as slat and clat, this denotes the time from
+		when fio created the I/O unit to completion of the I/O operation.
+
+**bw**
+		Bandwidth statistics based on samples. Same names as the xlat stats,
+		but also includes the number of samples taken (**samples**) and an
+		approximate percentage of total aggregate bandwidth this thread
+		received in its group (**per**). This last value is only really
+		useful if the threads in this group are on the same disk, since they
+		are then competing for disk access.
+
+**iops**
+		IOPS statistics based on samples. Same names as bw.
+
+**lat (nsec/usec/msec)**
+		The distribution of I/O completion latencies. This is the time from when
+		I/O leaves fio to when it gets completed. Unlike the separate
+		read/write/trim sections above, the data here and in the remaining
+		sections apply to all I/Os for the reporting group. 250=0.04% means that
+		0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11%
+		of the I/Os required 250 to 499us for completion.
+
+**cpu**
+		CPU usage. User and system time, along with the number of context
+		switches this thread went through, usage of system and user time, and
+		finally the number of major and minor page faults. The CPU utilization
+		numbers are averages for the jobs in that reporting group, while the
+		context and fault counters are summed.
 
-Run status group 0 (all jobs):
-   READ: io=64MB, aggrb=22178, minb=11355, maxb=11814, mint=2840msec, maxt=2955msec
-  WRITE: io=64MB, aggrb=1302, minb=666, maxb=669, mint=50093msec, maxt=50320msec
+**IO depths**
+		The distribution of I/O depths over the job lifetime.  The numbers are
+		divided into powers of 2 and each entry covers depths from that value
+		up to those that are lower than the next entry -- e.g., 16= covers
+		depths from 16 to 31.  Note that the range covered by a depth
+		distribution entry can be different to the range covered by the
+		equivalent submit/complete distribution entry.
+
+**IO submit**
+		How many pieces of I/O were submitted in a single submit call. Each
+		entry denotes that amount and below, until the previous entry -- e.g.,
+		16=100% means that we submitted anywhere between 9 and 16 I/Os per submit
+		call.  Note that the range covered by a submit distribution entry can
+		be different to the range covered by the equivalent depth distribution
+		entry.
+
+**IO complete**
+		Like the above submit number, but for completions instead.
+
+**IO issued rwt**
+		The number of read/write/trim requests issued, and how many of them were
+		short or dropped.
+
+**IO latency**
+		These values are for `--latency-target` and related options. When
+		these options are engaged, this section describes the I/O depth required
+		to meet the specified latency target.
+
+..
+	Example output was based on the following:
+	TZ=UTC fio --ioengine=null --iodepth=2 --size=100M --numjobs=2 \
+		--rate_process=poisson --io_limit=32M --name=read --bs=128k \
+		--rate=11M --name=write --rw=write --bs=2k --rate=700k
 
-For each data direction, it prints:
+After each client has been listed, the group statistics are printed. They
+will look like this::
 
-io=		Number of megabytes io performed.
-aggrb=		Aggregate bandwidth of threads in this group.
-minb=		The minimum average bandwidth a thread saw.
-maxb=		The maximum average bandwidth a thread saw.
-mint=		The smallest runtime of the threads in that group.
-maxt=		The longest runtime of the threads in that group.
+    Run status group 0 (all jobs):
+       READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s-10.8MiB/s (10.9MB/s-11.3MB/s), io=64.0MiB (67.1MB), run=2973-3069msec
+      WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s-621KiB/s (630kB/s-636kB/s), io=64.0MiB (67.1MB), run=52747-53223msec
+
+For each data direction it prints:
+
+**bw**
+		Aggregate bandwidth of threads in this group followed by the
+		minimum and maximum bandwidth of all the threads in this group.
+		Values outside of brackets are power-of-2 format and those
+		within are the equivalent value in a power-of-10 format.
+**io**
+		Aggregate I/O performed of all threads in this group. The
+		format is the same as bw.
+**run**
+		The smallest and longest runtimes of the threads in this group.
 
-And finally, the disk statistics are printed. They will look like this:
+And finally, the disk statistics are printed. This is Linux specific. They will look like this::
 
-Disk stats (read/write):
-  sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
+  Disk stats (read/write):
+    sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
 
 Each value is printed for both reads and writes, with reads first. The
 numbers denote:
 
-ios=		Number of ios performed by all groups.
-merge=		Number of merges io the io scheduler.
-ticks=		Number of ticks we kept the disk busy.
-io_queue=	Total time spent in the disk queue.
-util=		The disk utilization. A value of 100% means we kept the disk
+**ios**
+		Number of I/Os performed by all groups.
+**merge**
+		Number of merges performed by the I/O scheduler.
+**ticks**
+		Number of ticks we kept the disk busy.
+**in_queue**
+		Total time spent in the disk queue.
+**util**
+		The disk utilization. A value of 100% means we kept the disk
 		busy constantly, 50% would be a disk idling half of the time.
 
-It is also possible to get fio to dump the current output while it is
-running, without terminating the job. To do that, send fio the USR1 signal.
-You can also get regularly timed dumps by using the --status-interval
-parameter, or by creating a file in /tmp named fio-dump-status. If fio
-sees this file, it will unlink it and dump the current output status.
+It is also possible to get fio to dump the current output while it is running,
+without terminating the job. To do that, send fio the **USR1** signal.  You can
+also get regularly timed dumps by using the :option:`--status-interval`
+parameter, or by creating a file in :file:`/tmp` named
+:file:`fio-dump-status`. If fio sees this file, it will unlink it and dump the
+current output status.
+
+
+Terse output
+------------
+
+For scripted usage where you typically want to generate tables or graphs of the
+results, fio can output the results in a semicolon separated format.  The format
+is one long line of values, such as::
 
+    2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00%
+    A description of this job goes here.
 
-7.0 Terse output
-----------------
+The job description (if provided) follows on a second line.
 
-For scripted usage where you typically want to generate tables or graphs
-of the results, fio can output the results in a semicolon separated format.
-The format is one long line of values, such as:
+To enable terse output, use the :option:`--minimal` or
+:option:`--output-format`\=terse command line options. The
+first value is the version of the terse output format. If the output has to be
+changed for some reason, this number will be incremented by 1 to signify that
+change.
 
-2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00%
-A description of this job goes here.
+Split up, the format is as follows (comments in brackets denote when a
+field was introduced or whether it's specific to some terse version):
 
-The job description (if provided) follows on a second line.
+    ::
+
+        terse version, fio version [v3], jobname, groupid, error
+
+    READ status::
+
+        Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+        Submission latency: min, max, mean, stdev (usec)
+        Completion latency: min, max, mean, stdev (usec)
+        Completion latency percentiles: 20 fields (see below)
+        Total latency: min, max, mean, stdev (usec)
+        Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+        IOPS [v5]: min, max, mean, stdev, number of samples
+
+    WRITE status:
+
+    ::
+
+        Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+        Submission latency: min, max, mean, stdev (usec)
+        Completion latency: min, max, mean, stdev (usec)
+        Completion latency percentiles: 20 fields (see below)
+        Total latency: min, max, mean, stdev (usec)
+        Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+        IOPS [v5]: min, max, mean, stdev, number of samples
+
+    TRIM status [all but version 3]:
+
+        Fields are similar to READ/WRITE status.
+
+    CPU usage::
+
+        user, system, context switches, major faults, minor faults
+
+    I/O depths::
+
+        <=1, 2, 4, 8, 16, 32, >=64
+
+    I/O latencies microseconds::
+
+        <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
+
+    I/O latencies milliseconds::
+
+        <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
+
+    Disk utilization [v3]::
+
+        disk name, read ios, write ios, read merges, write merges, read ticks, write ticks,
+        time spent in queue, disk utilization percentage
+
+    Additional Info (dependent on continue_on_error, default off)::
+
+        total # errors, first error code
+
+    Additional Info (dependent on description being set)::
+
+        Text description
+
+Completion latency percentiles can be a grouping of up to 20 sets, so for the
+terse output fio writes all of them. Each field will look like this::
 
-To enable terse output, use the --minimal command line option. The first
-value is the version of the terse output format. If the output has to
-be changed for some reason, this number will be incremented by 1 to
-signify that change.
-
-Split up, the format is as follows:
-
-	terse version, fio version, jobname, groupid, error
-	READ status:
-		Total IO (KB), bandwidth (KB/sec), IOPS, runtime (msec)
-		Submission latency: min, max, mean, stdev (usec)
-		Completion latency: min, max, mean, stdev (usec)
-		Completion latency percentiles: 20 fields (see below)
-		Total latency: min, max, mean, stdev (usec)
-		Bw (KB/s): min, max, aggregate percentage of total, mean, stdev
-	WRITE status:
-		Total IO (KB), bandwidth (KB/sec), IOPS, runtime (msec)
-		Submission latency: min, max, mean, stdev (usec)
-		Completion latency: min, max, mean, stdev(usec)
-		Completion latency percentiles: 20 fields (see below)
-		Total latency: min, max, mean, stdev (usec)
-		Bw (KB/s): min, max, aggregate percentage of total, mean, stdev
-	CPU usage: user, system, context switches, major faults, minor faults
-	IO depths: <=1, 2, 4, 8, 16, 32, >=64
-	IO latencies microseconds: <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
-	IO latencies milliseconds: <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
-	Disk utilization: Disk name, Read ios, write ios,
-			  Read merges, write merges,
-			  Read ticks, write ticks,
-			  Time spent in queue, disk utilization percentage
-	Additional Info (dependent on continue_on_error, default off): total # errors, first error code
-
-	Additional Info (dependent on description being set): Text description
-
-Completion latency percentiles can be a grouping of up to 20 sets, so
-for the terse output fio writes all of them. Each field will look like this:
-
-	1.00%=6112
-
-which is the Xth percentile, and the usec latency associated with it.
-
-For disk utilization, all disks used by fio are shown. So for each disk
-there will be a disk utilization section.
-
-
-8.0 Trace file format
----------------------
-There are two trace file format that you can encounter. The older (v1) format
-is unsupported since version 1.20-rc3 (March 2008). It will still be described
+        1.00%=6112
+
+which is the Xth percentile, and the `usec` latency associated with it.
+
+For `Disk utilization`, all disks used by fio are shown. So for each disk there
+will be a disk utilization section.
+
+Below is a single line containing short names for each of the fields in the
+minimal output v3, separated by semicolons::
+
+        terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+
+
+JSON output
+------------
+
+The `json` output format is intended to be both human readable and convenient
+for automated parsing. For the most part its sections mirror those of the
+`normal` output. The `runtime` value is reported in msec and the `bw` value is
+reported in 1024 bytes per second units.
+
+
+JSON+ output
+------------
+
+The `json+` output format is identical to the `json` output format except that it
+adds a full dump of the completion latency bins. Each `bins` object contains a
+set of (key, value) pairs where keys are latency durations and values count how
+many I/Os had completion latencies of the corresponding duration. For example,
+consider::
+
+	"bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... }
+
+This data indicates that one I/O required 87,552ns to complete, two I/Os required
+100,864ns to complete, and 7529 I/Os required 107,008ns to complete.
+
+Also included with fio is a Python script `fio_jsonplus_clat2csv` that takes
+json+ output and generates CSV-formatted latency data suitable for plotting.
+
+The latency durations actually represent the midpoints of latency intervals.
+For details refer to :file:`stat.h`.
+
+
+Trace file format
+-----------------
+
+There are two trace file formats that you can encounter. The older (v1) format is
+unsupported since version 1.20-rc3 (March 2008). It will still be described
 below in case that you get an old trace and want to understand it.
 
 In any case the trace is a simple text file with a single action per line.
 
 
-8.1 Trace file format v1
-------------------------
-Each line represents a single io action in the following format:
+Trace file format v1
+~~~~~~~~~~~~~~~~~~~~
+
+Each line represents a single I/O action in the following format::
+
+	rw, offset, length
 
-rw, offset, length
+where `rw=0/1` for read/write, and the `offset` and `length` entries are in bytes.
 
-where rw=0/1 for read/write, and the offset and length entries being in bytes.
+This format is not supported in fio versions >= 1.20-rc3.
 
-This format is not supported in Fio versions => 1.20-rc3.
 
+Trace file format v2
+~~~~~~~~~~~~~~~~~~~~
 
-8.2 Trace file format v2
-------------------------
-The second version of the trace file format was added in Fio version 1.17.
-It allows to access more then one file per trace and has a bigger set of
-possible file actions.
+The second version of the trace file format was added in fio version 1.17.  It
+allows accessing more than one file per trace and has a bigger set of possible
+file actions.
 
-The first line of the trace file has to be:
+The first line of the trace file has to be::
 
-fio version 2 iolog
+    fio version 2 iolog
 
 Following this can be lines in two different formats, which are described below.
 
-The file management format:
+The file management format::
 
-filename action
+    filename action
 
-The filename is given as an absolute path. The action can be one of these:
+The `filename` is given as an absolute path. The `action` can be one of these:
+
+**add**
+		Add the given `filename` to the trace.
+**open**
+		Open the file with the given `filename`. The `filename` has to have
+		been added with the **add** action before.
+**close**
+		Close the file with the given `filename`. The file has to have been
+		opened before.
+
+
+The file I/O action format::
+
+    filename action offset length
+
+The `filename` is given as an absolute path, and has to have been added and
+opened before it can be used with this format. The `offset` and `length` are
+given in bytes. The `action` can be one of these:
+
+**wait**
+	   Wait for `offset` microseconds. Everything below 100 is discarded.
+	   The time is relative to the previous `wait` statement.
+**read**
+	   Read `length` bytes beginning from `offset`.
+**write**
+	   Write `length` bytes beginning from `offset`.
+**sync**
+	   :manpage:`fsync(2)` the file.
+**datasync**
+	   :manpage:`fdatasync(2)` the file.
+**trim**
+	   Trim the given file from the given `offset` for `length` bytes.
+
+CPU idleness profiling
+----------------------
+
+In some cases, we want to understand CPU overhead in a test. For example, we
+test patches specifically to check whether they reduce CPU usage.
+Fio implements a balloon approach to create a thread per CPU that runs at idle
+priority, meaning that it only runs when nobody else needs the CPU.
+By measuring the amount of work completed by the thread, idleness of each CPU
+can be derived accordingly.
+
+A unit of work is defined as touching a full page of unsigned characters. The mean
+and standard deviation of the time to complete a unit of work are reported in the
+"unit work" section. Options can be chosen to report detailed percpu idleness or overall
+system idleness by aggregating percpu stats.
+
+
+Verification and triggers
+-------------------------
 
-add          Add the given filename to the trace
-open         Open the file with the given filename. The filename has to have
-             been added with the add action before.
-close        Close the file with the given filename. The file has to have been
-             opened before.
-
-
-The file io action format:
-
-filename action offset length
-
-The filename is given as an absolute path, and has to have been added and opened
-before it can be used with this format. The offset and length are given in
-bytes. The action can be one of these:
-
-wait       Wait for 'offset' microseconds. Everything below 100 is discarded.
-	   The time is relative to the previous wait statement.
-read       Read 'length' bytes beginning from 'offset'
-write      Write 'length' bytes beginning from 'offset'
-sync       fsync() the file
-datasync   fdatasync() the file
-trim       trim the given file from the given 'offset' for 'length' bytes
-
-
-9.0 CPU idleness profiling
---------------------------
-In some cases, we want to understand CPU overhead in a test. For example,
-we test patches for the specific goodness of whether they reduce CPU usage.
-fio implements a balloon approach to create a thread per CPU that runs at
-idle priority, meaning that it only runs when nobody else needs the cpu.
-By measuring the amount of work completed by the thread, idleness of each
-CPU can be derived accordingly.
-
-An unit work is defined as touching a full page of unsigned characters. Mean
-and standard deviation of time to complete an unit work is reported in "unit
-work" section. Options can be chosen to report detailed percpu idleness or
-overall system idleness by aggregating percpu stats.
-
-
-10.0 Verification and triggers
-------------------------------
-Fio is usually run in one of two ways, when data verification is done. The
-first is a normal write job of some sort with verify enabled. When the
-write phase has completed, fio switches to reads and verifies everything
-it wrote. The second model is running just the write phase, and then later
-on running the same job (but with reads instead of writes) to repeat the
-same IO patterns and verify the contents. Both of these methods depend
-on the write phase being completed, as fio otherwise has no idea how much
-data was written.
-
-With verification triggers, fio supports dumping the current write state
-to local files. Then a subsequent read verify workload can load this state
-and know exactly where to stop. This is useful for testing cases where
-power is cut to a server in a managed fashion, for instance.
+Fio is usually run in one of two ways, when data verification is done. The first
+is a normal write job of some sort with verify enabled. When the write phase has
+completed, fio switches to reads and verifies everything it wrote. The second
+model is running just the write phase, and then later on running the same job
+(but with reads instead of writes) to repeat the same I/O patterns and verify
+the contents. Both of these methods depend on the write phase being completed,
+as fio otherwise has no idea how much data was written.
+
+With verification triggers, fio supports dumping the current write state to
+local files. Then a subsequent read verify workload can load this state and know
+exactly where to stop. This is useful for testing cases where power is cut to a
+server in a managed fashion, for instance.
 
 A verification trigger consists of two things:
 
-1) Storing the write state of each job
-2) Executing a trigger command
+1) Storing the write state of each job.
+2) Executing a trigger command.
 
-The write state is relatively small, on the order of hundreds of bytes
-to single kilobytes. It contains information on the number of completions
-done, the last X completions, etc.
-
-A trigger is invoked either through creation ('touch') of a specified
-file in the system, or through a timeout setting. If fio is run with
---trigger-file=/tmp/trigger-file, then it will continually check for
-the existence of /tmp/trigger-file. When it sees this file, it will
-fire off the trigger (thus saving state, and executing the trigger
+The write state is relatively small, on the order of hundreds of bytes to single
+kilobytes. It contains information on the number of completions done, the last X
+completions, etc.
+
+A trigger is invoked either through creation ('touch') of a specified file in
+the system, or through a timeout setting. If fio is run with
+:option:`--trigger-file`\= :file:`/tmp/trigger-file`, then it will continually
+check for the existence of :file:`/tmp/trigger-file`. When it sees this file, it
+will fire off the trigger (thus saving state, and executing the trigger
 command).
 
-For client/server runs, there's both a local and remote trigger. If
-fio is running as a server backend, it will send the job states back
-to the client for safe storage, then execute the remote trigger, if
-specified. If a local trigger is specified, the server will still send
-back the write state, but the client will then execute the trigger.
+For client/server runs, there's both a local and remote trigger. If fio is
+running as a server backend, it will send the job states back to the client for
+safe storage, then execute the remote trigger, if specified. If a local trigger
+is specified, the server will still send back the write state, but the client
+will then execute the trigger.
 
-10.1 Verification trigger example
----------------------------------
-Lets say we want to run a powercut test on the remote machine 'server'.
-Our write workload is in write-test.fio. We want to cut power to 'server'
-at some point during the run, and we'll run this test from the safety
-or our local machine, 'localbox'. On the server, we'll start the fio
-backend normally:
+Verification trigger example
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-server# fio --server
+Let's say we want to run a powercut test on the remote Linux machine 'server'.
+Our write workload is in :file:`write-test.fio`. We want to cut power to 'server' at
+some point during the run, and we'll run this test from the safety of our local
+machine, 'localbox'. On the server, we'll start the fio backend normally::
 
-and on the client, we'll fire off the workload:
+	server# fio --server
 
-localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-triger\""
+and on the client, we'll fire off the workload::
 
-We set /tmp/my-trigger as the trigger file, and we tell fio to execute
+	localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-trigger\""
 
-echo b > /proc/sysrq-trigger
+We set :file:`/tmp/my-trigger` as the trigger file, and we tell fio to execute::
 
-on the server once it has received the trigger and sent us the write
-state. This will work, but it's not _really_ cutting power to the server,
-it's merely abruptly rebooting it. If we have a remote way of cutting
-power to the server through IPMI or similar, we could do that through
-a local trigger command instead. Lets assume we have a script that does
-IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could
-then have run fio with a local trigger instead:
+	echo b > /proc/sysrq-trigger
 
-localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server"
+on the server once it has received the trigger and sent us the write state. This
+will work, but it's not **really** cutting power to the server, it's merely
+abruptly rebooting it. If we have a remote way of cutting power to the server
+through IPMI or similar, we could do that through a local trigger command
+instead. Let's assume we have a script that does IPMI reboot of a given hostname,
+ipmi-reboot. On localbox, we could then have run fio with a local trigger
+instead::
 
-For this case, fio would wait for the server to send us the write state,
-then execute 'ipmi-reboot server' when that happened.
+	localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server"
 
-10.2 Loading verify state
--------------------------
-To load store write state, read verification job file must contain
-the verify_state_load option. If that is set, fio will load the previously
+For this case, fio would wait for the server to send us the write state, then
+execute ``ipmi-reboot server`` when that happened.
+
+Loading verify state
+~~~~~~~~~~~~~~~~~~~~
+
+To load stored write state, a read verification job file must contain the
+:option:`verify_state_load` option. If that is set, fio will load the previously
 stored state. For a local fio run this is done by loading the files directly,
-and on a client/server run, the server backend will ask the client to send
-the files over and load them from there.
+and on a client/server run, the server backend will ask the client to send the
+files over and load them from there.
 
 
-11.0 Log File Formats
----------------------
+Log File Formats
+----------------
 
 Fio supports a variety of log file formats, for logging latencies, bandwidth,
 and IOPS. The logs share a common format, which looks like this:
 
-time (msec), value, data direction, offset
+    *time* (`msec`), *value*, *data direction*, *block size* (`bytes`),
+    *offset* (`bytes`)
 
-Time for the log entry is always in milliseconds. The value logged depends
+*Time* for the log entry is always in milliseconds. The *value* logged depends
 on the type of log, it will be one of the following:
 
-	Latency log		Value is latency in usecs
-	Bandwidth log		Value is in KB/sec
-	IOPS log		Value is IOPS
-
-Data direction is one of the following:
-
-	0			IO is a READ
-	1			IO is a WRITE
-	2			IO is a TRIM
-
-The offset is the offset, in bytes, from the start of the file, for that
-particular IO. The logging of the offset can be toggled with 'log_offset'.
-
-If windowed logging is enabled through 'log_avg_msec', then fio doesn't log
-individual IOs. Instead of logs the average values over the specified
-period of time. Since 'data direction' and 'offset' are per-IO values,
-they aren't applicable if windowed logging is enabled. If windowed logging
-is enabled and 'log_max_value' is set, then fio logs maximum values in
-that window instead of averages.
+    **Latency log**
+		Value is latency in nsecs
+    **Bandwidth log**
+		Value is in KiB/sec
+    **IOPS log**
+		Value is IOPS
+
+*Data direction* is one of the following:
+
+	**0**
+		I/O is a READ
+	**1**
+		I/O is a WRITE
+	**2**
+		I/O is a TRIM
+
+The entry's *block size* is always in bytes. The *offset* is the offset, in bytes,
+from the start of the file, for that particular I/O. The logging of the offset can be
+toggled with :option:`log_offset`.
+
+Fio defaults to logging every individual I/O.  When IOPS are logged for individual
+I/Os the *value* entry will always be 1. If windowed logging is enabled through
+:option:`log_avg_msec`, fio logs the average values over the specified period of time.
+If windowed logging is enabled and :option:`log_max_value` is set, then fio logs
+maximum values in that window instead of averages. Since *data direction*, *block
+size* and *offset* are per-I/O values, if windowed logging is enabled they
+aren't applicable and will be 0.
+
+Client/Server
+-------------
+
+Normally fio is invoked as a stand-alone application on the machine where the
+I/O workload should be generated. However, the backend and frontend of fio can
+be run separately, i.e., the fio server can generate an I/O workload on the "Device
+Under Test" while being controlled by a client on another machine.
+
+Start the server on the machine which has access to the storage DUT::
+
+	$ fio --server=args
+
+where `args` defines what fio listens to. The arguments are of the form
+``type:hostname or IP,port``. *type* is either ``ip`` (or ip4) for TCP/IP
+v4, ``ip6`` for TCP/IP v6, or ``sock`` for a local unix domain socket.
+*hostname* is either a hostname or IP address, and *port* is the port to listen
+to (only valid for TCP/IP, not a local socket). Some examples:
+
+1) ``fio --server``
+
+   Start a fio server, listening on all interfaces on the default port (8765).
+
+2) ``fio --server=ip:hostname,4444``
+
+   Start a fio server, listening on IP belonging to hostname and on port 4444.
+
+3) ``fio --server=ip6:::1,4444``
+
+   Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
+
+4) ``fio --server=,4444``
+
+   Start a fio server, listening on all interfaces on port 4444.
+
+5) ``fio --server=1.2.3.4``
+
+   Start a fio server, listening on IP 1.2.3.4 on the default port.
+
+6) ``fio --server=sock:/tmp/fio.sock``
+
+   Start a fio server, listening on the local socket :file:`/tmp/fio.sock`.
+
+Once a server is running, a "client" can connect to the fio server with::
+
+	fio <local-args> --client=<server> <remote-args> <job file(s)>
+
+where `local-args` are arguments for the client where it is running, `server`
+is the connect string, and `remote-args` and `job file(s)` are sent to the
+server. The `server` string follows the same format as it does on the server
+side, to allow IP/hostname/socket and port strings.
+
+Fio can connect to multiple servers this way::
+
+    fio --client=<server1> <job file(s)> --client=<server2> <job file(s)>
+
+If the job file is located on the fio server, then you can tell the server to
+load a local file as well. This is done by using :option:`--remote-config` ::
+
+   fio --client=server --remote-config /path/to/file.fio
+
+Then fio will open this local (to the server) job file instead of being passed
+one from the client.
+
+If you have many servers (example: 100 VMs/containers), you can input a pathname
+of a file containing host IPs/names as the parameter value for the
+:option:`--client` option.  For instance, here is a :file:`host.list`
+file containing 2 hostnames::
+
+	host1.your.dns.domain
+	host2.your.dns.domain
+
+The fio command would then be::
+
+    fio --client=host.list <job file(s)>
+
+In this mode, you cannot input server-specific parameters or job files -- all
+servers receive the same job file.
+
+In order to let ``fio --client`` runs use a shared filesystem from multiple
+hosts, ``fio --client`` now prepends the IP address of the server to the
+filename.  For example, if fio is using the directory :file:`/mnt/nfs/fio` and is
+writing filename :file:`fileio.tmp`, with a :option:`--client` `hostfile`
+containing two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and
+192.168.10.121, then fio will create two files::
+
+	/mnt/nfs/fio/192.168.10.120.fileio.tmp
+	/mnt/nfs/fio/192.168.10.121.fileio.tmp
diff -Nru fio-2.16/idletime.c fio-3.1/idletime.c
--- fio-2.16/idletime.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/idletime.c	2017-09-28 10:23:20.000000000 +0000
@@ -11,7 +11,7 @@
 static double calibrate_unit(unsigned char *data)
 {
 	unsigned long t, i, j, k;
-	struct timeval tps;
+	struct timespec tps;
 	double tunit = 0.0;
 
 	for (i = 0; i < CALIBRATE_RUNS; i++) {
@@ -183,7 +183,6 @@
 void fio_idle_prof_init(void)
 {
 	int i, ret;
-	struct timeval tp;
 	struct timespec ts;
 	pthread_attr_t tattr;
 	struct idle_prof_thread *ipt;
@@ -282,9 +281,8 @@
 		pthread_mutex_lock(&ipt->init_lock);
 		while ((ipt->state != TD_EXITED) &&
 		       (ipt->state!=TD_INITIALIZED)) {
-			fio_gettime(&tp, NULL);
-			ts.tv_sec = tp.tv_sec + 1;
-			ts.tv_nsec = tp.tv_usec * 1000;
+			fio_gettime(&ts, NULL);
+			ts.tv_sec += 1;
 			pthread_cond_timedwait(&ipt->cond, &ipt->init_lock, &ts);
 		}
 		pthread_mutex_unlock(&ipt->init_lock);
@@ -325,7 +323,6 @@
 {
 	int i;
 	uint64_t runt;
-	struct timeval tp;
 	struct timespec ts;
 	struct idle_prof_thread *ipt;
 
@@ -343,9 +340,8 @@
 		pthread_mutex_lock(&ipt->start_lock);
 		while ((ipt->state != TD_EXITED) &&
 		       (ipt->state!=TD_NOT_CREATED)) {
-			fio_gettime(&tp, NULL);
-			ts.tv_sec = tp.tv_sec + 1;
-			ts.tv_nsec = tp.tv_usec * 1000;
+			fio_gettime(&ts, NULL);
+			ts.tv_sec += 1;
 			/* timed wait in case a signal is not received */
 			pthread_cond_timedwait(&ipt->cond, &ipt->start_lock, &ts);
 		}
diff -Nru fio-2.16/idletime.h fio-3.1/idletime.h
--- fio-2.16/idletime.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/idletime.h	2017-09-28 10:23:20.000000000 +0000
@@ -26,8 +26,8 @@
 	pthread_t thread;
 	int cpu;
 	int state;
-	struct timeval tps;
-	struct timeval tpe;
+	struct timespec tps;
+	struct timespec tpe;
 	double cali_time; /* microseconds to finish a unit work */
 	double loops;
 	double idleness;
diff -Nru fio-2.16/init.c fio-3.1/init.c
--- fio-2.16/init.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/init.c	2017-09-28 10:23:20.000000000 +0000
@@ -31,6 +31,7 @@
 #include "oslib/strcasestr.h"
 
 #include "crc/test.h"
+#include "lib/pow2.h"
 
 const char fio_version_string[] = FIO_VERSION;
 
@@ -39,7 +40,6 @@
 static char **ini_file;
 static int max_jobs = FIO_MAX_JOBS;
 static int dump_cmdline;
-static long long def_timeout;
 static int parse_only;
 
 static struct thread_data def_thread;
@@ -93,11 +93,6 @@
 		.val		= 'o' | FIO_CLIENT_FLAG,
 	},
 	{
-		.name		= (char *) "timeout",
-		.has_arg	= required_argument,
-		.val		= 't' | FIO_CLIENT_FLAG,
-	},
-	{
 		.name		= (char *) "latency-log",
 		.has_arg	= required_argument,
 		.val		= 'l' | FIO_CLIENT_FLAG,
@@ -361,10 +356,12 @@
 		perror("shmat");
 		return 1;
 	}
+	if (shm_attach_to_open_removed())
+		shmctl(shm_id, IPC_RMID, NULL);
 #endif
 
 	memset(threads, 0, max_jobs * sizeof(struct thread_data));
-	fio_debug_jobp = (void *) threads + max_jobs * sizeof(struct thread_data);
+	fio_debug_jobp = (unsigned int *)(threads + max_jobs);
 	*fio_debug_jobp = -1;
 
 	flow_init();
@@ -372,14 +369,6 @@
 	return 0;
 }
 
-static void set_cmd_options(struct thread_data *td)
-{
-	struct thread_options *o = &td->o;
-
-	if (!o->timeout)
-		o->timeout = def_timeout;
-}
-
 static void dump_print_option(struct print_option *p)
 {
 	const char *delim;
@@ -445,15 +434,13 @@
 /*
  * Return a free job structure.
  */
-static struct thread_data *get_new_job(int global, struct thread_data *parent,
-				       int preserve_eo, const char *jobname)
+static struct thread_data *get_new_job(bool global, struct thread_data *parent,
+				       bool preserve_eo, const char *jobname)
 {
 	struct thread_data *td;
 
-	if (global) {
-		set_cmd_options(&def_thread);
+	if (global)
 		return &def_thread;
-	}
 	if (setup_thread_area()) {
 		log_err("error: failed to setup shm segment\n");
 		return NULL;
@@ -472,6 +459,7 @@
 		copy_opt_list(td, parent);
 
 	td->io_ops = NULL;
+	td->io_ops_init = 0;
 	if (!preserve_eo)
 		td->eo = NULL;
 
@@ -491,7 +479,6 @@
 	if (!parent->o.group_reporting || parent == &def_thread)
 		stat_number++;
 
-	set_cmd_options(td);
 	return td;
 }
 
@@ -536,7 +523,7 @@
 
 	td->rate_next_io_time[ddir] = 0;
 	td->rate_io_issue_bytes[ddir] = 0;
-	td->last_usec = 0;
+	td->last_usec[ddir] = 0;
 	return 0;
 }
 
@@ -581,6 +568,17 @@
 }
 
 /*
+ * <3 Johannes
+ */
+static unsigned int gcd(unsigned int m, unsigned int n)
+{
+	if (!n)
+		return m;
+
+	return gcd(n, m % n);
+}
+
+/*
  * Lazy way of fixing up options that depend on each other. We could also
  * define option callback handlers, but this is easier.
  */
@@ -589,7 +587,7 @@
 	struct thread_options *o = &td->o;
 	int ret = 0;
 
-#ifndef FIO_HAVE_PSHARED_MUTEX
+#ifndef CONFIG_PSHARED
 	if (!o->use_thread) {
 		log_info("fio: this platform does not support process shared"
 			 " mutexes, forcing use of threads. Use the 'thread'"
@@ -622,7 +620,7 @@
 	/*
 	 * Reads can do overwrites, we always need to pre-create the file
 	 */
-	if (td_read(td) || td_rw(td))
+	if (td_read(td))
 		o->overwrite = 1;
 
 	if (!o->min_bs[DDIR_READ])
@@ -700,6 +698,23 @@
 	if (o->iodepth_batch_complete_min > o->iodepth_batch_complete_max)
 		o->iodepth_batch_complete_max = o->iodepth_batch_complete_min;
 
+	/*
+	 * There's no need to check for in-flight overlapping IOs if the job
+	 * isn't changing data or the maximum iodepth is guaranteed to be 1
+	 */
+	if (o->serialize_overlap && !(td->flags & TD_F_READ_IOLOG) &&
+	    (!(td_write(td) || td_trim(td)) || o->iodepth == 1))
+		o->serialize_overlap = 0;
+	/*
+	 * Currently can't check for overlaps in offload mode
+	 */
+	if (o->serialize_overlap && o->io_submit_mode == IO_MODE_OFFLOAD) {
+		log_err("fio: checking for in-flight overlaps when the "
+			"io_submit_mode is offload is not supported\n");
+		o->serialize_overlap = 0;
+		ret = warnings_fatal;
+	}
+
 	if (o->nr_files > td->files_index)
 		o->nr_files = td->files_index;
 
@@ -733,13 +748,30 @@
 		o->size = -1ULL;
 
 	if (o->verify != VERIFY_NONE) {
-		if (td_write(td) && o->do_verify && o->numjobs > 1) {
-			log_info("Multiple writers may overwrite blocks that "
-				"belong to other jobs. This can cause "
+		if (td_write(td) && o->do_verify && o->numjobs > 1 &&
+		    (o->filename ||
+		     !(o->unique_filename &&
+		       strstr(o->filename_format, "$jobname") &&
+		       strstr(o->filename_format, "$jobnum") &&
+		       strstr(o->filename_format, "$filenum")))) {
+			log_info("fio: multiple writers may overwrite blocks "
+				"that belong to other jobs. This can cause "
 				"verification failures.\n");
 			ret = warnings_fatal;
 		}
 
+		/*
+		 * Warn if verification is requested but no verification of any
+		 * kind can be started due to time constraints
+		 */
+		if (td_write(td) && o->do_verify && o->timeout &&
+		    o->time_based && !td_read(td) && !o->verify_backlog) {
+			log_info("fio: verification read phase will never "
+				 "start because write phase uses all of "
+				 "runtime\n");
+			ret = warnings_fatal;
+		}
+
 		if (!fio_option_is_set(o, refill_buffers))
 			o->refill_buffers = 1;
 
@@ -755,10 +787,20 @@
 			o->verify_interval = o->min_bs[DDIR_WRITE];
 		else if (td_read(td) && o->verify_interval > o->min_bs[DDIR_READ])
 			o->verify_interval = o->min_bs[DDIR_READ];
+
+		/*
+		 * Verify interval must be a factor of both min and max
+		 * write size
+		 */
+		if (o->verify_interval % o->min_bs[DDIR_WRITE] ||
+		    o->verify_interval % o->max_bs[DDIR_WRITE])
+			o->verify_interval = gcd(o->min_bs[DDIR_WRITE],
+							o->max_bs[DDIR_WRITE]);
 	}
 
 	if (o->pre_read) {
-		o->invalidate_cache = 0;
+		if (o->invalidate_cache)
+			o->invalidate_cache = 0;
 		if (td_ioengine_flagged(td, FIO_PIPEIO)) {
 			log_info("fio: cannot pre-read files with an IO engine"
 				 " that isn't seekable. Pre-read disabled.\n");
@@ -773,6 +815,11 @@
 			o->unit_base = 8;
 	}
 
+#ifndef FIO_HAVE_ANY_FALLOCATE
+	/* Platform doesn't support any fallocate so force it to none */
+	o->fallocate_mode = FIO_FALLOCATE_NONE;
+#endif
+
 #ifndef CONFIG_FDATASYNC
 	if (o->fdatasync_blocks) {
 		log_info("fio: this platform does not support fdatasync()"
@@ -790,7 +837,7 @@
 	 * Windows doesn't support O_DIRECT or O_SYNC with the _open interface,
 	 * so fail if we're passed those flags
 	 */
-	if (td_ioengine_flagged(td, FIO_SYNCIO) && (td->o.odirect || td->o.sync_io)) {
+	if (td_ioengine_flagged(td, FIO_SYNCIO) && (o->odirect || o->sync_io)) {
 		log_err("fio: Windows does not support direct or non-buffered io with"
 				" the synchronous ioengines. Use the 'windowsaio' ioengine"
 				" with 'direct=1' and 'iodepth=1' instead.\n");
@@ -816,8 +863,8 @@
 	 * Using a non-uniform random distribution excludes usage of
 	 * a random map
 	 */
-	if (td->o.random_distribution != FIO_RAND_DIST_RANDOM)
-		td->o.norandommap = 1;
+	if (o->random_distribution != FIO_RAND_DIST_RANDOM)
+		o->norandommap = 1;
 
 	/*
 	 * If size is set but less than the min block size, complain
@@ -831,16 +878,16 @@
 	/*
 	 * O_ATOMIC implies O_DIRECT
 	 */
-	if (td->o.oatomic)
-		td->o.odirect = 1;
+	if (o->oatomic)
+		o->odirect = 1;
 
 	/*
 	 * If randseed is set, that overrides randrepeat
 	 */
-	if (fio_option_is_set(&td->o, rand_seed))
-		td->o.rand_repeatable = 0;
+	if (fio_option_is_set(o, rand_seed))
+		o->rand_repeatable = 0;
 
-	if (td_ioengine_flagged(td, FIO_NOEXTEND) && td->o.file_append) {
+	if (td_ioengine_flagged(td, FIO_NOEXTEND) && o->file_append) {
 		log_err("fio: can't append/extent with IO engine %s\n", td->io_ops->name);
 		ret = 1;
 	}
@@ -855,49 +902,28 @@
 	if (!td->loops)
 		td->loops = 1;
 
-	if (td->o.block_error_hist && td->o.nr_files != 1) {
+	if (o->block_error_hist && o->nr_files != 1) {
 		log_err("fio: block error histogram only available "
 			"with a single file per job, but %d files "
-			"provided\n", td->o.nr_files);
+			"provided\n", o->nr_files);
 		ret = 1;
 	}
 
-	return ret;
-}
-
-/*
- * This function leaks the buffer
- */
-char *fio_uint_to_kmg(unsigned int val)
-{
-	char *buf = malloc(32);
-	char post[] = { 0, 'K', 'M', 'G', 'P', 'E', 0 };
-	char *p = post;
-
-	do {
-		if (val & 1023)
-			break;
-
-		val >>= 10;
-		p++;
-	} while (*p);
-
-	snprintf(buf, 32, "%u%c", val, *p);
-	return buf;
-}
-
-/* External engines are specified by "external:name.o") */
-static const char *get_engine_name(const char *str)
-{
-	char *p = strstr(str, ":");
-
-	if (!p)
-		return str;
+	if (fio_option_is_set(o, clat_percentiles) &&
+	    !fio_option_is_set(o, lat_percentiles)) {
+		o->lat_percentiles = !o->clat_percentiles;
+	} else if (fio_option_is_set(o, lat_percentiles) &&
+		   !fio_option_is_set(o, clat_percentiles)) {
+		o->clat_percentiles = !o->lat_percentiles;
+	} else if (fio_option_is_set(o, lat_percentiles) &&
+		   fio_option_is_set(o, clat_percentiles) &&
+		   o->lat_percentiles && o->clat_percentiles) {
+		log_err("fio: lat_percentiles and clat_percentiles are "
+			"mutually exclusive\n");
+		ret = 1;
+	}
 
-	p++;
-	strip_blank_front(&p);
-	strip_blank_end(p);
-	return p;
+	return ret;
 }
 
 static void init_rand_file_service(struct thread_data *td)
@@ -922,9 +948,9 @@
 	bool use64;
 
 	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
-		use64 = 1;
+		use64 = true;
 	else
-		use64 = 0;
+		use64 = false;
 
 	init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF],
 		use64);
@@ -934,7 +960,22 @@
 {
 	int i;
 
-	init_rand_seed(&td->bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF], use64);
+	/*
+	 * trimwrite is special in that we need to generate the same
+	 * offsets to get the "write after trim" effect. If we are
+	 * using bssplit to set buffer length distributions, ensure that
+	 * we seed the trim and write generators identically.
+	 */
+	if (td_trimwrite(td)) {
+		init_rand_seed(&td->bsrange_state[DDIR_READ], td->rand_seeds[FIO_RAND_BS_OFF], use64);
+		init_rand_seed(&td->bsrange_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_BS1_OFF], use64);
+		init_rand_seed(&td->bsrange_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_BS1_OFF], use64);
+	} else {
+		init_rand_seed(&td->bsrange_state[DDIR_READ], td->rand_seeds[FIO_RAND_BS_OFF], use64);
+		init_rand_seed(&td->bsrange_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_BS1_OFF], use64);
+		init_rand_seed(&td->bsrange_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_BS2_OFF], use64);
+	}
+
 	td_fill_verify_state_seed(td);
 	init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false);
 
@@ -946,7 +987,9 @@
 	init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], use64);
 	init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF], use64);
 	init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY], use64);
-	init_rand_seed(&td->poisson_state, td->rand_seeds[FIO_RAND_POISSON_OFF], 0);
+	init_rand_seed(&td->poisson_state[0], td->rand_seeds[FIO_RAND_POISSON_OFF], 0);
+	init_rand_seed(&td->poisson_state[1], td->rand_seeds[FIO_RAND_POISSON2_OFF], 0);
+	init_rand_seed(&td->poisson_state[2], td->rand_seeds[FIO_RAND_POISSON3_OFF], 0);
 	init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false);
 	init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false);
 
@@ -978,9 +1021,9 @@
 	}
 
 	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
-		use64 = 1;
+		use64 = true;
 	else
-		use64 = 0;
+		use64 = false;
 
 	td_fill_rand_seeds_internal(td, use64);
 
@@ -994,22 +1037,27 @@
  */
 int ioengine_load(struct thread_data *td)
 {
-	const char *engine;
-
-	/*
-	 * Engine has already been loaded.
-	 */
-	if (td->io_ops)
-		return 0;
 	if (!td->o.ioengine) {
 		log_err("fio: internal fault, no IO engine specified\n");
 		return 1;
 	}
 
-	engine = get_engine_name(td->o.ioengine);
-	td->io_ops = load_ioengine(td, engine);
+	if (td->io_ops) {
+		/* An engine is loaded, but the requested ioengine
+		 * may have changed.
+		 */
+		if (!strcmp(td->io_ops->name, td->o.ioengine)) {
+			/* The right engine is already loaded */
+			return 0;
+		}
+
+		/* Unload the old engine. */
+		free_ioengine(td);
+	}
+
+	td->io_ops = load_ioengine(td);
 	if (!td->io_ops) {
-		log_err("fio: failed to load engine %s\n", engine);
+		log_err("fio: failed to load engine\n");
 		return 1;
 	}
 
@@ -1037,7 +1085,7 @@
 		 */
 		if (origeo) {
 			memcpy(td->eo, origeo, td->io_ops->option_struct_size);
-			options_mem_dupe(td->eo, td->io_ops->options);
+			options_mem_dupe(td->io_ops->options, td->eo);
 		} else {
 			memset(td->eo, 0, td->io_ops->option_struct_size);
 			fill_default_options(td->eo, td->io_ops->options);
@@ -1081,6 +1129,9 @@
 
 	if (o->verify_async || o->io_submit_mode == IO_MODE_OFFLOAD)
 		td->flags |= TD_F_NEED_LOCK;
+
+	if (o->mem_type == MEM_CUDA_MALLOC)
+		td->flags &= ~TD_F_SCRAMBLE_BUFFERS;
 }
 
 static int setup_random_seeds(struct thread_data *td)
@@ -1088,8 +1139,12 @@
 	unsigned long seed;
 	unsigned int i;
 
-	if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed))
-		return init_random_state(td, td->rand_seeds, sizeof(td->rand_seeds));
+	if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) {
+		int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds));
+		if (!ret)
+			td_fill_rand_seeds(td);
+		return ret;
+	}
 
 	seed = td->o.rand_seed;
 	for (i = 0; i < 4; i++)
@@ -1131,7 +1186,7 @@
 
 	if (!o->filename_format || !strlen(o->filename_format)) {
 		sprintf(buf, "%s.%d.%d", jobname, jobnum, filenum);
-		return NULL;
+		return buf;
 	}
 
 	for (f = &fpre_keywords[0]; f->keyword; f++)
@@ -1360,6 +1415,7 @@
 	td->mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
 
 	td->ts.clat_percentiles = o->clat_percentiles;
+	td->ts.lat_percentiles = o->lat_percentiles;
 	td->ts.percentile_precision = o->percentile_precision;
 	memcpy(td->ts.percentile_list, o->percentile_list, sizeof(o->percentile_list));
 
@@ -1368,6 +1424,7 @@
 		td->ts.slat_stat[i].min_val = ULONG_MAX;
 		td->ts.lat_stat[i].min_val = ULONG_MAX;
 		td->ts.bw_stat[i].min_val = ULONG_MAX;
+		td->ts.iops_stat[i].min_val = ULONG_MAX;
 	}
 	td->ddir_seq_nr = o->ddir_seq_nr;
 
@@ -1384,7 +1441,7 @@
 	prev_group_jobs++;
 
 	if (setup_random_seeds(td)) {
-		td_verror(td, errno, "init_random_state");
+		td_verror(td, errno, "setup_random_seeds");
 		goto err;
 	}
 
@@ -1528,15 +1585,16 @@
 			if (!td_ioengine_flagged(td, FIO_NOIO)) {
 				char *c1, *c2, *c3, *c4;
 				char *c5 = NULL, *c6 = NULL;
+				int i2p = is_power_of_2(o->kb_base);
 
-				c1 = fio_uint_to_kmg(o->min_bs[DDIR_READ]);
-				c2 = fio_uint_to_kmg(o->max_bs[DDIR_READ]);
-				c3 = fio_uint_to_kmg(o->min_bs[DDIR_WRITE]);
-				c4 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]);
+				c1 = num2str(o->min_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE);
+				c2 = num2str(o->max_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE);
+				c3 = num2str(o->min_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE);
+				c4 = num2str(o->max_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE);
 
 				if (!o->bs_is_seq_rand) {
-					c5 = fio_uint_to_kmg(o->min_bs[DDIR_TRIM]);
-					c6 = fio_uint_to_kmg(o->max_bs[DDIR_TRIM]);
+					c5 = num2str(o->min_bs[DDIR_TRIM], 4, 1, i2p, N2S_BYTE);
+					c6 = num2str(o->max_bs[DDIR_TRIM], 4, 1, i2p, N2S_BYTE);
 				}
 
 				log_info("%s: (g=%d): rw=%s, ", td->o.name,
@@ -1544,10 +1602,10 @@
 							ddir_str(o->td_ddir));
 
 				if (o->bs_is_seq_rand)
-					log_info("bs(seq/rand)=%s-%s/%s-%s, ",
+					log_info("bs=(R) %s-%s, (W) %s-%s, bs_is_seq_rand, ",
 							c1, c2, c3, c4);
 				else
-					log_info("bs=%s-%s/%s-%s/%s-%s, ",
+					log_info("bs=(R) %s-%s, (W) %s-%s, (T) %s-%s, ",
 							c1, c2, c3, c4, c5, c6);
 
 				log_info("ioengine=%s, iodepth=%u\n",
@@ -1573,7 +1631,7 @@
 	 */
 	numjobs = o->numjobs;
 	while (--numjobs) {
-		struct thread_data *td_new = get_new_job(0, td, 1, jobname);
+		struct thread_data *td_new = get_new_job(false, td, true, jobname);
 
 		if (!td_new)
 			goto err;
@@ -1634,11 +1692,11 @@
 			sprintf(jobname, "%s", o[i] + 5);
 		}
 		if (in_global && !td_parent)
-			td_parent = get_new_job(1, &def_thread, 0, jobname);
+			td_parent = get_new_job(true, &def_thread, false, jobname);
 		else if (!in_global && !td) {
 			if (!td_parent)
 				td_parent = &def_thread;
-			td = get_new_job(0, td_parent, 0, jobname);
+			td = get_new_job(false, td_parent, false, jobname);
 		}
 		if (in_global)
 			fio_options_parse(td_parent, (char **) &o[i], 1);
@@ -1690,7 +1748,7 @@
 		char *file, int is_buf, int stonewall_flag, int type,
 		int nested, char *name, char ***popts, int *aopts, int *nopts)
 {
-	unsigned int global = 0;
+	bool global = false;
 	char *string;
 	FILE *f;
 	char *p;
@@ -1799,7 +1857,7 @@
 				first_sect = 0;
 			}
 
-			td = get_new_job(global, &def_thread, 0, name);
+			td = get_new_job(global, &def_thread, false, name);
 			if (!td) {
 				ret = 1;
 				break;
@@ -2003,6 +2061,11 @@
 #endif
 }
 
+/*
+ * Following options aren't printed by usage().
+ * --append-terse - Equivalent to --output-format=terse, see f6a7df53.
+ * --latency-log - Deprecated option.
+ */
 static void usage(const char *name)
 {
 	printf("%s\n", fio_version_string);
@@ -2011,15 +2074,15 @@
 	show_debug_categories();
 	printf("  --parse-only\t\tParse options only, don't start any IO\n");
 	printf("  --output\t\tWrite output to file\n");
-	printf("  --runtime\t\tRuntime in seconds\n");
 	printf("  --bandwidth-log\tGenerate aggregate bandwidth logs\n");
 	printf("  --minimal\t\tMinimal (terse) output\n");
-	printf("  --output-format=x\tOutput format (terse,json,json+,normal)\n");
-	printf("  --terse-version=x\tSet terse version output format to 'x'\n");
+	printf("  --output-format=type\tOutput format (terse,json,json+,normal)\n");
+	printf("  --terse-version=type\tSet terse version output format"
+		" (default 3, or 2 or 4)\n");
 	printf("  --version\t\tPrint version info and exit\n");
 	printf("  --help\t\tPrint this page\n");
 	printf("  --cpuclock-test\tPerform test/validation of CPU clock\n");
-	printf("  --crctest\t\tTest speed of checksum functions\n");
+	printf("  --crctest=[type]\tTest speed of checksum functions\n");
 	printf("  --cmdhelp=cmd\t\tPrint command help, \"all\" for all of"
 		" them\n");
 	printf("  --enghelp=engine\tPrint ioengine help, or list"
@@ -2035,14 +2098,15 @@
 	printf(" 't' period passed\n");
 	printf("  --readonly\t\tTurn on safety read-only checks, preventing"
 		" writes\n");
-	printf("  --section=name\tOnly run specified section in job file\n");
+	printf("  --section=name\tOnly run specified section in job file,"
+		" multiple sections can be specified\n");
 	printf("  --alloc-size=kb\tSet smalloc pool to this size in kb"
-		" (def 1024)\n");
+		" (def 16384)\n");
 	printf("  --warnings-fatal\tFio parser warnings are fatal\n");
 	printf("  --max-jobs=nr\t\tMaximum number of threads/processes to support\n");
 	printf("  --server=args\t\tStart a backend fio server\n");
 	printf("  --daemonize=pidfile\tBackground fio server, write pid to file\n");
-	printf("  --client=hostname\tTalk to remote backend fio server at hostname\n");
+	printf("  --client=hostname\tTalk to remote backend(s) fio server at hostname\n");
 	printf("  --remote-config=file\tTell fio server to load this local job file\n");
 	printf("  --idle-prof=option\tReport cpu idleness on a system or percpu basis\n"
 		"\t\t\t(option=system,percpu) or run unit work\n"
@@ -2051,7 +2115,7 @@
 	printf("  --inflate-log=log\tInflate and output compressed log\n");
 #endif
 	printf("  --trigger-file=file\tExecute trigger cmd when file exists\n");
-	printf("  --trigger-timeout=t\tExecute trigger af this time\n");
+	printf("  --trigger-timeout=t\tExecute trigger at this time\n");
 	printf("  --trigger=cmd\t\tSet this command as local trigger\n");
 	printf("  --trigger-remote=cmd\tSet this command as remote trigger\n");
 	printf("  --aux-path=path\tUse this path for fio state generated files\n");
@@ -2325,13 +2389,6 @@
 			smalloc_pool_size <<= 10;
 			sinit();
 			break;
-		case 't':
-			if (check_str_time(optarg, &def_timeout, 1)) {
-				log_err("fio: failed parsing time %s\n", optarg);
-				do_exit++;
-				exit_val = 1;
-			}
-			break;
 		case 'l':
 			log_err("fio: --latency-log is deprecated. Use per-job latency log options.\n");
 			do_exit++;
@@ -2340,17 +2397,22 @@
 		case 'b':
 			write_bw_log = 1;
 			break;
-		case 'o':
+		case 'o': {
+			FILE *tmp;
+
 			if (f_out && f_out != stdout)
 				fclose(f_out);
 
-			f_out = fopen(optarg, "w+");
-			if (!f_out) {
-				perror("fopen output");
-				exit(1);
+			tmp = fopen(optarg, "w+");
+			if (!tmp) {
+				log_err("fio: output file open error: %s\n", strerror(errno));
+				exit_val = 1;
+				do_exit++;
+				break;
 			}
-			f_err = f_out;
+			f_err = f_out = tmp;
 			break;
+			}
 		case 'm':
 			output_format = FIO_OUTPUT_TERSE;
 			break;
@@ -2402,8 +2464,7 @@
 			break;
 		case 'V':
 			terse_version = atoi(optarg);
-			if (!(terse_version == 2 || terse_version == 3 ||
-			     terse_version == 4)) {
+			if (!(terse_version >= 2 && terse_version <= 5)) {
 				log_err("fio: bad terse version format\n");
 				exit_val = 1;
 				do_exit++;
@@ -2484,7 +2545,7 @@
 				if (is_section && skip_this_section(val))
 					continue;
 
-				td = get_new_job(global, &def_thread, 1, NULL);
+				td = get_new_job(global, &def_thread, true, NULL);
 				if (!td || ioengine_load(td)) {
 					if (td) {
 						put_job(td);
@@ -2514,7 +2575,6 @@
 			}
 
 			if (!ret && !strcmp(opt, "ioengine")) {
-				free_ioengine(td);
 				if (ioengine_load(td)) {
 					put_job(td);
 					td = NULL;
@@ -2722,7 +2782,7 @@
 		if (!ret) {
 			ret = add_job(td, td->o.name ?: "fio", 0, 0, client_type);
 			if (ret)
-				did_arg = 1;
+				exit(1);
 		}
 	}
 
@@ -2734,9 +2794,6 @@
 	}
 
 out_free:
-	if (pid_file)
-		free(pid_file);
-
 	return ini_idx;
 }
 
@@ -2805,7 +2862,7 @@
 		if (did_arg)
 			return 0;
 
-		log_err("No jobs(s) defined\n\n");
+		log_err("No job(s) defined\n\n");
 
 		if (!did_arg) {
 			usage(argv[0]);
diff -Nru fio-2.16/io_ddir.h fio-3.1/io_ddir.h
--- fio-2.16/io_ddir.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/io_ddir.h	2017-09-28 10:23:20.000000000 +0000
@@ -61,9 +61,9 @@
 
 static inline const char *ddir_str(enum td_ddir ddir)
 {
-	static const char *__str[] = { NULL, "read", "write", "rw", NULL,
+	static const char *__str[] = { NULL, "read", "write", "rw", "rand",	/* name table, indexed directly by the ddir value */
 				"randread", "randwrite", "randrw",
-				"trim", NULL, NULL, NULL, "randtrim" };
+				"trim", NULL, "trimwrite", NULL, "randtrim" };	/* 13 slots (indices 0-12); NULL slots have no name; no bounds check below */
 
 	return __str[ddir];
 }
diff -Nru fio-2.16/ioengine.h fio-3.1/ioengine.h
--- fio-2.16/ioengine.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/ioengine.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,265 +0,0 @@
-#ifndef FIO_IOENGINE_H
-#define FIO_IOENGINE_H
-
-#include "compiler/compiler.h"
-#include "os/os.h"
-#include "log.h"
-#include "io_ddir.h"
-#include "debug.h"
-#include "file.h"
-#include "workqueue.h"
-
-#ifdef CONFIG_LIBAIO
-#include <libaio.h>
-#endif
-#ifdef CONFIG_GUASI
-#include <guasi.h>
-#endif
-
-#define FIO_IOOPS_VERSION	23
-
-enum {
-	IO_U_F_FREE		= 1 << 0,
-	IO_U_F_FLIGHT		= 1 << 1,
-	IO_U_F_NO_FILE_PUT	= 1 << 2,
-	IO_U_F_IN_CUR_DEPTH	= 1 << 3,
-	IO_U_F_BUSY_OK		= 1 << 4,
-	IO_U_F_TRIMMED		= 1 << 5,
-	IO_U_F_BARRIER		= 1 << 6,
-	IO_U_F_VER_LIST		= 1 << 7,
-};
-
-/*
- * The io unit
- */
-struct io_u {
-	struct timeval start_time;
-	struct timeval issue_time;
-
-	struct fio_file *file;
-	unsigned int flags;
-	enum fio_ddir ddir;
-
-	/*
-	 * For replay workloads, we may want to account as a different
-	 * IO type than what is being submitted.
-	 */
-	enum fio_ddir acct_ddir;
-
-	/*
-	 * Write generation
-	 */
-	unsigned short numberio;
-
-	/*
-	 * Allocated/set buffer and length
-	 */
-	unsigned long buflen;
-	unsigned long long offset;
-	void *buf;
-
-	/*
-	 * Initial seed for generating the buffer contents
-	 */
-	uint64_t rand_seed;
-
-	/*
-	 * IO engine state, may be different from above when we get
-	 * partial transfers / residual data counts
-	 */
-	void *xfer_buf;
-	unsigned long xfer_buflen;
-
-	/*
-	 * Parameter related to pre-filled buffers and
-	 * their size to handle variable block sizes.
-	 */
-	unsigned long buf_filled_len;
-
-	struct io_piece *ipo;
-
-	unsigned int resid;
-	unsigned int error;
-
-	/*
-	 * io engine private data
-	 */
-	union {
-		unsigned int index;
-		unsigned int seen;
-		void *engine_data;
-	};
-
-	union {
-		struct flist_head verify_list;
-		struct workqueue_work work;
-	};
-
-	/*
-	 * Callback for io completion
-	 */
-	int (*end_io)(struct thread_data *, struct io_u **);
-
-	union {
-#ifdef CONFIG_LIBAIO
-		struct iocb iocb;
-#endif
-#ifdef CONFIG_POSIXAIO
-		os_aiocb_t aiocb;
-#endif
-#ifdef FIO_HAVE_SGIO
-		struct sg_io_hdr hdr;
-#endif
-#ifdef CONFIG_GUASI
-		guasi_req_t greq;
-#endif
-#ifdef CONFIG_SOLARISAIO
-		aio_result_t resultp;
-#endif
-#ifdef FIO_HAVE_BINJECT
-		struct b_user_cmd buc;
-#endif
-#ifdef CONFIG_RDMA
-		struct ibv_mr *mr;
-#endif
-		void *mmap_data;
-		uint64_t null;
-	};
-};
-
-/*
- * io_ops->queue() return values
- */
-enum {
-	FIO_Q_COMPLETED	= 0,		/* completed sync */
-	FIO_Q_QUEUED	= 1,		/* queued, will complete async */
-	FIO_Q_BUSY	= 2,		/* no more room, call ->commit() */
-};
-
-struct ioengine_ops {
-	struct flist_head list;
-	const char *name;
-	int version;
-	int flags;
-	int (*setup)(struct thread_data *);
-	int (*init)(struct thread_data *);
-	int (*prep)(struct thread_data *, struct io_u *);
-	int (*queue)(struct thread_data *, struct io_u *);
-	int (*commit)(struct thread_data *);
-	int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
-	struct io_u *(*event)(struct thread_data *, int);
-	char *(*errdetails)(struct io_u *);
-	int (*cancel)(struct thread_data *, struct io_u *);
-	void (*cleanup)(struct thread_data *);
-	int (*open_file)(struct thread_data *, struct fio_file *);
-	int (*close_file)(struct thread_data *, struct fio_file *);
-	int (*invalidate)(struct thread_data *, struct fio_file *);
-	int (*unlink_file)(struct thread_data *, struct fio_file *);
-	int (*get_file_size)(struct thread_data *, struct fio_file *);
-	void (*terminate)(struct thread_data *);
-	int (*iomem_alloc)(struct thread_data *, size_t);
-	void (*iomem_free)(struct thread_data *);
-	int (*io_u_init)(struct thread_data *, struct io_u *);
-	void (*io_u_free)(struct thread_data *, struct io_u *);
-	int option_struct_size;
-	struct fio_option *options;
-};
-
-enum fio_ioengine_flags {
-	FIO_SYNCIO	= 1 << 0,	/* io engine has synchronous ->queue */
-	FIO_RAWIO	= 1 << 1,	/* some sort of direct/raw io */
-	FIO_DISKLESSIO	= 1 << 2,	/* no disk involved */
-	FIO_NOEXTEND	= 1 << 3,	/* engine can't extend file */
-	FIO_NODISKUTIL  = 1 << 4,	/* diskutil can't handle filename */
-	FIO_UNIDIR	= 1 << 5,	/* engine is uni-directional */
-	FIO_NOIO	= 1 << 6,	/* thread does only pseudo IO */
-	FIO_PIPEIO	= 1 << 7,	/* input/output no seekable */
-	FIO_BARRIER	= 1 << 8,	/* engine supports barriers */
-	FIO_MEMALIGN	= 1 << 9,	/* engine wants aligned memory */
-	FIO_BIT_BASED	= 1 << 10,	/* engine uses a bit base (e.g. uses Kbit as opposed to KB) */
-	FIO_FAKEIO	= 1 << 11,	/* engine pretends to do IO */
-};
-
-/*
- * External engine defined symbol to fill in the engine ops structure
- */
-typedef void (*get_ioengine_t)(struct ioengine_ops **);
-
-/*
- * io engine entry points
- */
-extern int __must_check td_io_init(struct thread_data *);
-extern int __must_check td_io_prep(struct thread_data *, struct io_u *);
-extern int __must_check td_io_queue(struct thread_data *, struct io_u *);
-extern int __must_check td_io_sync(struct thread_data *, struct fio_file *);
-extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
-extern int __must_check td_io_commit(struct thread_data *);
-extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *);
-extern int td_io_close_file(struct thread_data *, struct fio_file *);
-extern int td_io_unlink_file(struct thread_data *, struct fio_file *);
-extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *);
-
-extern struct ioengine_ops *load_ioengine(struct thread_data *, const char *);
-extern void register_ioengine(struct ioengine_ops *);
-extern void unregister_ioengine(struct ioengine_ops *);
-extern void free_ioengine(struct thread_data *);
-extern void close_ioengine(struct thread_data *);
-
-extern int fio_show_ioengine_help(const char *engine);
-
-/*
- * io unit handling
- */
-extern struct io_u *__get_io_u(struct thread_data *);
-extern struct io_u *get_io_u(struct thread_data *);
-extern void put_io_u(struct thread_data *, struct io_u *);
-extern void clear_io_u(struct thread_data *, struct io_u *);
-extern void requeue_io_u(struct thread_data *, struct io_u **);
-extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *);
-extern int __must_check io_u_queued_complete(struct thread_data *, int);
-extern void io_u_queued(struct thread_data *, struct io_u *);
-extern int io_u_quiesce(struct thread_data *);
-extern void io_u_log_error(struct thread_data *, struct io_u *);
-extern void io_u_mark_depth(struct thread_data *, unsigned int);
-extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned int);
-extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int);
-void io_u_mark_complete(struct thread_data *, unsigned int);
-void io_u_mark_submit(struct thread_data *, unsigned int);
-bool queue_full(const struct thread_data *);
-
-int do_io_u_sync(const struct thread_data *, struct io_u *);
-int do_io_u_trim(const struct thread_data *, struct io_u *);
-
-#ifdef FIO_INC_DEBUG
-static inline void dprint_io_u(struct io_u *io_u, const char *p)
-{
-	struct fio_file *f = io_u->file;
-
-	dprint(FD_IO, "%s: io_u %p: off=%llu/len=%lu/ddir=%d", p, io_u,
-					(unsigned long long) io_u->offset,
-					io_u->buflen, io_u->ddir);
-	if (fio_debug & (1 << FD_IO)) {
-		if (f)
-			log_info("/%s", f->file_name);
-
-		log_info("\n");
-	}
-}
-#else
-#define dprint_io_u(io_u, p)
-#endif
-
-static inline enum fio_ddir acct_ddir(struct io_u *io_u)
-{
-	if (io_u->acct_ddir != -1)
-		return io_u->acct_ddir;
-
-	return io_u->ddir;
-}
-
-#define io_u_clear(td, io_u, val)	\
-	td_flags_clear((td), &(io_u->flags), (val))
-#define io_u_set(td, io_u, val)		\
-	td_flags_set((td), &(io_u)->flags, (val))
-
-#endif
diff -Nru fio-2.16/ioengines.c fio-3.1/ioengines.c
--- fio-2.16/ioengines.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/ioengines.c	2017-09-28 10:23:20.000000000 +0000
@@ -123,13 +123,10 @@
 	return ops;
 }
 
-struct ioengine_ops *load_ioengine(struct thread_data *td, const char *name)
+static struct ioengine_ops *__load_ioengine(const char *name)
 {
-	struct ioengine_ops *ops;
 	char engine[64];
 
-	dprint(FD_IO, "load ioengine %s\n", name);
-
 	engine[sizeof(engine) - 1] = '\0';
 	strncpy(engine, name, sizeof(engine) - 1);
 
@@ -139,10 +136,37 @@
 	if (!strncmp(engine, "linuxaio", 8) || !strncmp(engine, "aio", 3))
 		strcpy(engine, "libaio");
 
-	ops = find_ioengine(engine);
+	dprint(FD_IO, "load ioengine %s\n", engine);
+	return find_ioengine(engine);
+}
+
+struct ioengine_ops *load_ioengine(struct thread_data *td)
+{
+	struct ioengine_ops *ops = NULL;
+	const char *name;
+
+	/*
+	 * Use ->ioengine_so_path if an external ioengine path is specified.
+	 * In this case, ->ioengine is "external" which also means the prefix
+	 * for external ioengines "external:" is properly used.
+	 */
+	name = td->o.ioengine_so_path ?: td->o.ioengine;
+
+	/*
+	 * Try to load ->ioengine first, and if failed try to dlopen(3) either
+	 * ->ioengine or ->ioengine_so_path.  This is redundant for an external
+	 * ioengine with prefix, and also leaves the possibility of unexpected
+	 * behavior (e.g. if the "external" ioengine exists), but we do this
+	 * so as not to break job files not using the prefix.
+	 */
+	ops = __load_ioengine(td->o.ioengine);
 	if (!ops)
 		ops = dlopen_ioengine(td, name);
 
+	/*
+	 * If ops is NULL, we failed to load ->ioengine, and also failed to
+	 * dlopen(3) either ->ioengine or ->ioengine_so_path as a path.
+	 */
 	if (!ops) {
 		log_err("fio: engine %s not loadable\n", name);
 		return NULL;
@@ -281,7 +305,7 @@
 		 */
 		if (td->o.read_iolog_file)
 			memcpy(&td->last_issue, &io_u->issue_time,
-					sizeof(struct timeval));
+					sizeof(io_u->issue_time));
 	}
 
 	if (ddir_rw(ddir)) {
@@ -318,8 +342,8 @@
 	    td->o.odirect) {
 
 		log_info("fio: first direct IO errored. File system may not "
-			 "support direct IO, or iomem_align= is bad. Try "
-			 "setting direct=0.\n");
+			 "support direct IO, or iomem_align= is bad, or "
+			 "invalid block size. Try setting direct=0.\n");
 	}
 
 	if (!td->io_ops->commit || io_u->ddir == DDIR_TRIM) {
@@ -356,7 +380,7 @@
 		 */
 		if (td->o.read_iolog_file)
 			memcpy(&td->last_issue, &io_u->issue_time,
-					sizeof(struct timeval));
+					sizeof(io_u->issue_time));
 	}
 
 	return ret;
@@ -368,17 +392,17 @@
 
 	if (td->io_ops->init) {
 		ret = td->io_ops->init(td);
-		if (ret && td->o.iodepth > 1) {
-			log_err("fio: io engine init failed. Perhaps try"
-				" reducing io depth?\n");
-		}
+		if (ret)
+			log_err("fio: io engine %s init failed.%s\n",
+				td->io_ops->name,
+				td->o.iodepth > 1 ?
+				" Perhaps try reducing io depth?" : "");
+		else
+			td->io_ops_init = 1;
 		if (!td->error)
 			td->error = ret;
 	}
 
-	if (!ret && td_ioengine_flagged(td, FIO_NOIO))
-		td->flags |= TD_F_NOIO;
-
 	return ret;
 }
 
@@ -449,7 +473,7 @@
 		goto err;
 
 	if (td->o.fadvise_hint != F_ADV_NONE &&
-	    (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_FILE)) {
+	    (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) {
 		int flags;
 
 		if (td->o.fadvise_hint == F_ADV_TYPE) {
@@ -472,39 +496,32 @@
 			goto err;
 		}
 	}
-#ifdef FIO_HAVE_STREAMID
-	if (td->o.fadvise_stream &&
-	    (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_FILE)) {
-		off_t stream = td->o.fadvise_stream;
-
-		if (posix_fadvise(f->fd, stream, f->io_size, POSIX_FADV_STREAMID) < 0) {
-			td_verror(td, errno, "fadvise streamid");
-			goto err;
-		}
-	}
-#endif
-
-#ifdef FIO_OS_DIRECTIO
-	/*
-	 * Some OS's have a distinct call to mark the file non-buffered,
-	 * instead of using O_DIRECT (Solaris)
-	 */
-	if (td->o.odirect) {
-		int ret = fio_set_odirect(f->fd);
+#ifdef FIO_HAVE_WRITE_HINT
+	if (fio_option_is_set(&td->o, write_hint) &&
+	    (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) {
+		uint64_t hint = td->o.write_hint;
+		int cmd;
 
-		if (ret) {
-			td_verror(td, ret, "fio_set_odirect");
-			if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */
-				log_err("fio: doing directIO to RAW devices or ZFS not supported\n");
-			} else {
-				log_err("fio: the file system does not seem to support direct IO\n");
-			}
+		/*
+		 * For direct IO, we just need/want to set the hint on
+		 * the file descriptor. For buffered IO, we need to set
+		 * it on the inode.
+		 */
+		if (td->o.odirect)
+			cmd = F_SET_FILE_RW_HINT;
+		else
+			cmd = F_SET_RW_HINT;
 
+		if (fcntl(f->fd, cmd, &hint) < 0) {
+			td_verror(td, errno, "fcntl write hint");
 			goto err;
 		}
 	}
 #endif
 
+	if (td->o.odirect && !OS_O_DIRECT && fio_set_directio(td, f))
+		goto err;
+
 done:
 	log_file(td, f, FIO_LOG_OPEN_FILE);
 	return 0;
@@ -556,77 +573,18 @@
 	return td->io_ops->get_file_size(td, f);
 }
 
-static int do_sync_file_range(const struct thread_data *td,
-			      struct fio_file *f)
-{
-	off64_t offset, nbytes;
-
-	offset = f->first_write;
-	nbytes = f->last_write - f->first_write;
-
-	if (!nbytes)
-		return 0;
-
-	return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range);
-}
-
-int do_io_u_sync(const struct thread_data *td, struct io_u *io_u)
-{
-	int ret;
-
-	if (io_u->ddir == DDIR_SYNC) {
-		ret = fsync(io_u->file->fd);
-	} else if (io_u->ddir == DDIR_DATASYNC) {
-#ifdef CONFIG_FDATASYNC
-		ret = fdatasync(io_u->file->fd);
-#else
-		ret = io_u->xfer_buflen;
-		io_u->error = EINVAL;
-#endif
-	} else if (io_u->ddir == DDIR_SYNC_FILE_RANGE)
-		ret = do_sync_file_range(td, io_u->file);
-	else {
-		ret = io_u->xfer_buflen;
-		io_u->error = EINVAL;
-	}
-
-	if (ret < 0)
-		io_u->error = errno;
-
-	return ret;
-}
-
-int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
-{
-#ifndef FIO_HAVE_TRIM
-	io_u->error = EINVAL;
-	return 0;
-#else
-	struct fio_file *f = io_u->file;
-	int ret;
-
-	ret = os_trim(f->fd, io_u->offset, io_u->xfer_buflen);
-	if (!ret)
-		return io_u->xfer_buflen;
-
-	io_u->error = ret;
-	return 0;
-#endif
-}
-
 int fio_show_ioengine_help(const char *engine)
 {
 	struct flist_head *entry;
-	struct thread_data td;
+	struct ioengine_ops *io_ops;
 	char *sep;
 	int ret = 1;
 
 	if (!engine || !*engine) {
 		log_info("Available IO engines:\n");
 		flist_for_each(entry, &engine_list) {
-			td.io_ops = flist_entry(entry, struct ioengine_ops,
-						list);
-			log_info("\t%s\n", td.io_ops->name);
+			io_ops = flist_entry(entry, struct ioengine_ops, list);
+			log_info("\t%s\n", io_ops->name);
 		}
 		return 0;
 	}
@@ -636,20 +594,16 @@
 		sep++;
 	}
 
-	memset(&td, 0, sizeof(td));
-
-	td.io_ops = load_ioengine(&td, engine);
-	if (!td.io_ops) {
+	io_ops = __load_ioengine(engine);
+	if (!io_ops) {
 		log_info("IO engine %s not found\n", engine);
 		return 1;
 	}
 
-	if (td.io_ops->options)
-		ret = show_cmd_help(td.io_ops->options, sep);
+	if (io_ops->options)
+		ret = show_cmd_help(io_ops->options, sep);
 	else
-		log_info("IO engine %s has no options\n", td.io_ops->name);
-
-	free_ioengine(&td);
+		log_info("IO engine %s has no options\n", io_ops->name);
 
 	return ret;
 }
diff -Nru fio-2.16/ioengines.h fio-3.1/ioengines.h
--- fio-2.16/ioengines.h	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/ioengines.h	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,90 @@
+#ifndef FIO_IOENGINE_H
+#define FIO_IOENGINE_H
+
+#include "compiler/compiler.h"
+#include "os/os.h"
+#include "file.h"
+#include "io_u.h"
+
+#define FIO_IOOPS_VERSION	23
+
+/*
+ * io_ops->queue() return values
+ */
+enum {
+	FIO_Q_COMPLETED	= 0,		/* completed sync */
+	FIO_Q_QUEUED	= 1,		/* queued, will complete async */
+	FIO_Q_BUSY	= 2,		/* no more room, call ->commit() */
+};
+
+struct ioengine_ops {	/* per-engine descriptor: identity fields plus a table of callbacks */
+	struct flist_head list;	/* list linkage; fio_show_ioengine_help() reaches entries of engine_list through it */
+	const char *name;	/* engine name, printed in the engine listing and in error messages */
+	int version;	/* NOTE(review): presumably compared against FIO_IOOPS_VERSION - confirm */
+	int flags;	/* NOTE(review): presumably a mask of enum fio_ioengine_flags bits - confirm */
+	int (*setup)(struct thread_data *);
+	int (*init)(struct thread_data *);	/* optional (caller checks for NULL); non-zero return is logged as an init failure */
+	int (*prep)(struct thread_data *, struct io_u *);
+	int (*queue)(struct thread_data *, struct io_u *);	/* returns one of the FIO_Q_* values */
+	int (*commit)(struct thread_data *);	/* optional: ioengines.c tests this pointer for NULL */
+	int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
+	struct io_u *(*event)(struct thread_data *, int);
+	char *(*errdetails)(struct io_u *);
+	int (*cancel)(struct thread_data *, struct io_u *);
+	void (*cleanup)(struct thread_data *);
+	int (*open_file)(struct thread_data *, struct fio_file *);
+	int (*close_file)(struct thread_data *, struct fio_file *);
+	int (*invalidate)(struct thread_data *, struct fio_file *);
+	int (*unlink_file)(struct thread_data *, struct fio_file *);
+	int (*get_file_size)(struct thread_data *, struct fio_file *);
+	void (*terminate)(struct thread_data *);
+	int (*iomem_alloc)(struct thread_data *, size_t);
+	void (*iomem_free)(struct thread_data *);
+	int (*io_u_init)(struct thread_data *, struct io_u *);
+	void (*io_u_free)(struct thread_data *, struct io_u *);
+	int option_struct_size;
+	struct fio_option *options;	/* engine-specific option table; NULL is reported as "has no options" */
+};
+
+enum fio_ioengine_flags {
+	FIO_SYNCIO	= 1 << 0,	/* io engine has synchronous ->queue */
+	FIO_RAWIO	= 1 << 1,	/* some sort of direct/raw io */
+	FIO_DISKLESSIO	= 1 << 2,	/* no disk involved */
+	FIO_NOEXTEND	= 1 << 3,	/* engine can't extend file */
+	FIO_NODISKUTIL  = 1 << 4,	/* diskutil can't handle filename */
+	FIO_UNIDIR	= 1 << 5,	/* engine is uni-directional */
+	FIO_NOIO	= 1 << 6,	/* thread does only pseudo IO */
+	FIO_PIPEIO	= 1 << 7,	/* input/output no seekable */
+	FIO_BARRIER	= 1 << 8,	/* engine supports barriers */
+	FIO_MEMALIGN	= 1 << 9,	/* engine wants aligned memory */
+	FIO_BIT_BASED	= 1 << 10,	/* engine uses a bit base (e.g. uses Kbit as opposed to KB) */
+	FIO_FAKEIO	= 1 << 11,	/* engine pretends to do IO */
+};
+
+/*
+ * External engine defined symbol to fill in the engine ops structure
+ */
+typedef void (*get_ioengine_t)(struct ioengine_ops **);
+
+/*
+ * io engine entry points
+ */
+extern int __must_check td_io_init(struct thread_data *);
+extern int __must_check td_io_prep(struct thread_data *, struct io_u *);
+extern int __must_check td_io_queue(struct thread_data *, struct io_u *);
+extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
+extern int __must_check td_io_commit(struct thread_data *);
+extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *);
+extern int td_io_close_file(struct thread_data *, struct fio_file *);
+extern int td_io_unlink_file(struct thread_data *, struct fio_file *);
+extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *);
+
+extern struct ioengine_ops *load_ioengine(struct thread_data *);
+extern void register_ioengine(struct ioengine_ops *);
+extern void unregister_ioengine(struct ioengine_ops *);
+extern void free_ioengine(struct thread_data *);
+extern void close_ioengine(struct thread_data *);
+
+extern int fio_show_ioengine_help(const char *engine);
+
+#endif
diff -Nru fio-2.16/iolog.c fio-3.1/iolog.c
--- fio-2.16/iolog.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/iolog.c	2017-09-28 10:23:20.000000000 +0000
@@ -19,6 +19,7 @@
 #include "trim.h"
 #include "filelock.h"
 #include "smalloc.h"
+#include "blktrace.h"
 
 static int iolog_flush(struct io_log *log);
 
@@ -64,7 +65,7 @@
 {
 	uint64_t usec = utime_since_now(&td->last_issue);
 	uint64_t this_delay;
-	struct timeval tv;
+	struct timespec ts;
 
 	if (delay < td->time_offset) {
 		td->time_offset = 0;
@@ -77,7 +78,7 @@
 
 	delay -= usec;
 
-	fio_gettime(&tv, NULL);
+	fio_gettime(&ts, NULL);
 	while (delay && !td->terminate) {
 		this_delay = delay;
 		if (this_delay > 500000)
@@ -87,7 +88,7 @@
 		delay -= this_delay;
 	}
 
-	usec = utime_since_now(&tv);
+	usec = utime_since_now(&ts);
 	if (usec > delay)
 		td->time_offset = usec - delay;
 	else
@@ -226,21 +227,16 @@
 	}
 
 	/*
-	 * We don't need to sort the entries, if:
+	 * We don't need to sort the entries if we only performed sequential
+	 * writes. In this case, just reading back data in the order we wrote
+	 * it out is faster but still safe.
 	 *
-	 *	Sequential writes, or
-	 *	Random writes that lay out the file as it goes along
-	 *
-	 * For both these cases, just reading back data in the order we
-	 * wrote it out is the fastest.
-	 *
-	 * One exception is if we don't have a random map AND we are doing
-	 * verifies, in that case we need to check for duplicate blocks and
-	 * drop the old one, which we rely on the rb insert/lookup for
-	 * handling.
+	 * One exception is if we don't have a random map in which case we need
+	 * to check for duplicate blocks and drop the old one, which we rely on
+	 * the rb insert/lookup for handling.
 	 */
-	if (((!td->o.verifysort) || !td_random(td) || !td->o.overwrite) &&
-	      (file_randommap(td, ipo->file) || td->o.verify == VERIFY_NONE)) {
+	if (((!td->o.verifysort) || !td_random(td)) &&
+	      file_randommap(td, ipo->file)) {
 		INIT_FLIST_HEAD(&ipo->list);
 		flist_add_tail(&ipo->list, &td->io_hist_list);
 		ipo->flags |= IP_F_ONLIST;
@@ -277,13 +273,14 @@
 			overlap = 1;
 
 		if (overlap) {
-			dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu",
+			dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu\n",
 				__ipo->offset, __ipo->len,
 				ipo->offset, ipo->len);
 			td->io_hist_len--;
 			rb_erase(parent, &td->io_hist_tree);
 			remove_trim_entry(td, __ipo);
-			free(__ipo);
+			if (!(__ipo->flags & IP_F_IN_FLIGHT))
+				free(__ipo);
 			goto restart;
 		}
 	}
@@ -422,7 +419,7 @@
 				continue;
 			}
 		} else {
-			log_err("bad iolog2: %s", p);
+			log_err("bad iolog2: %s\n", p);
 			continue;
 		}
 
@@ -642,6 +639,7 @@
 		l->log_gz = 0;
 	else if (l->log_gz || l->log_gz_store) {
 		mutex_init_pshared(&l->chunk_lock);
+		mutex_init_pshared(&l->deferred_free_lock);
 		p->td->flags |= TD_F_COMPRESS_LOG;
 	}
 
@@ -696,7 +694,7 @@
 	sfree(log);
 }
 
-inline unsigned long hist_sum(int j, int stride, unsigned int *io_u_plat,
+unsigned long hist_sum(int j, int stride, unsigned int *io_u_plat,
 		unsigned int *io_u_plat_last)
 {
 	unsigned long sum;
@@ -1143,6 +1141,42 @@
 
 #ifdef CONFIG_ZLIB
 
+static bool warned_on_drop;	/* latched once the "had to drop" error has been logged */
+
+static void iolog_put_deferred(struct io_log *log, void *ptr)	/* park ptr so iolog_free_deferred() can free() it later */
+{
+	if (!ptr)	/* nothing to park */
+		return;
+
+	pthread_mutex_lock(&log->deferred_free_lock);	/* held while deferred and deferred_items[] are updated */
+	if (log->deferred < IOLOG_MAX_DEFER) {	/* free slot available: store ptr, bump the count */
+		log->deferred_items[log->deferred] = ptr;
+		log->deferred++;
+	} else if (!warned_on_drop) {	/* table full: ptr is neither stored nor freed here; warn only once */
+		log_err("fio: had to drop log entry free\n");
+		warned_on_drop = true;
+	}
+	pthread_mutex_unlock(&log->deferred_free_lock);
+}
+
+static void iolog_free_deferred(struct io_log *log)	/* free() every pointer parked by iolog_put_deferred() */
+{
+	int i;	/* NOTE(review): signed index compared against the unsigned log->deferred */
+
+	if (!log->deferred)	/* early-out, read before the lock is taken: nothing parked */
+		return;
+
+	pthread_mutex_lock(&log->deferred_free_lock);
+
+	for (i = 0; i < log->deferred; i++) {	/* release each parked pointer and clear its slot */
+		free(log->deferred_items[i]);
+		log->deferred_items[i] = NULL;
+	}
+
+	log->deferred = 0;	/* table is empty again */
+	pthread_mutex_unlock(&log->deferred_free_lock);
+}
+
 static int gz_work(struct iolog_flush_data *data)
 {
 	struct iolog_compress *c = NULL;
@@ -1235,7 +1269,7 @@
 	if (ret != Z_OK)
 		log_err("fio: deflateEnd %d\n", ret);
 
-	free(data->samples);
+	iolog_put_deferred(data->log, data->samples);
 
 	if (!flist_empty(&list)) {
 		pthread_mutex_lock(&data->log->chunk_lock);
@@ -1246,7 +1280,7 @@
 	ret = 0;
 done:
 	if (data->free)
-		free(data);
+		sfree(data);
 	return ret;
 err:
 	while (!flist_empty(&list)) {
@@ -1347,7 +1381,7 @@
 {
 	struct iolog_flush_data *data;
 
-	data = malloc(sizeof(*data));
+	data = smalloc(sizeof(*data));
 	if (!data)
 		return 1;
 
@@ -1361,6 +1395,9 @@
 	cur_log->log = NULL;
 
 	workqueue_enqueue(&log->td->log_compress_wq, &data->work);
+
+	iolog_free_deferred(log);
+
 	return 0;
 }
 #else
diff -Nru fio-2.16/iolog.h fio-3.1/iolog.h
--- fio-2.16/iolog.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/iolog.h	2017-09-28 10:23:20.000000000 +0000
@@ -4,7 +4,7 @@
 #include "lib/rbtree.h"
 #include "lib/ieee754.h"
 #include "flist.h"
-#include "ioengine.h"
+#include "ioengines.h"
 
 /*
  * Use for maintaining statistics
@@ -117,7 +117,7 @@
 	 */
 	struct io_stat avg_window[DDIR_RWDIR_CNT];
 	unsigned long avg_msec;
-	unsigned long avg_last;
+	unsigned long avg_last[DDIR_RWDIR_CNT];
 
 	/*
 	 * Windowed latency histograms, for keeping track of when we need to
@@ -131,6 +131,11 @@
 	pthread_mutex_t chunk_lock;
 	unsigned int chunk_seq;
 	struct flist_head chunk_list;
+
+	pthread_mutex_t deferred_free_lock;
+#define IOLOG_MAX_DEFER	8
+	void *deferred_items[IOLOG_MAX_DEFER];
+	unsigned int deferred;
 };
 
 /*
@@ -259,7 +264,7 @@
 
 static inline bool per_unit_log(struct io_log *log)
 {
-	return log && !log->avg_msec;
+	return log && (!log->avg_msec || log->log_gz || log->log_gz_store);
 }
 
 static inline bool inline_log(struct io_log *log)
@@ -271,7 +276,7 @@
 
 static inline void ipo_bytes_align(unsigned int replay_align, struct io_piece *ipo)
 {
-	if (replay_align)
+	if (!replay_align)
 		return;
 
 	ipo->offset &= ~(replay_align - (uint64_t)1);
diff -Nru fio-2.16/io_u.c fio-3.1/io_u.c
--- fio-2.16/io_u.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/io_u.c	2017-09-28 10:23:20.000000000 +0000
@@ -20,7 +20,7 @@
 
 	int error;			/* output */
 	uint64_t bytes_done[DDIR_RWDIR_CNT];	/* output */
-	struct timeval time;		/* output */
+	struct timespec time;		/* output */
 };
 
 /*
@@ -37,7 +37,7 @@
  */
 static void mark_random_map(struct thread_data *td, struct io_u *io_u)
 {
-	unsigned int min_bs = td->o.rw_min_bs;
+	unsigned int min_bs = td->o.min_bs[io_u->ddir];
 	struct fio_file *f = io_u->file;
 	unsigned int nr_blocks;
 	uint64_t block;
@@ -62,6 +62,7 @@
 
 	/*
 	 * Hmm, should we make sure that ->io_size <= ->real_file_size?
+	 * -> not for now since there is code assuming it could go either way.
 	 */
 	max_size = f->io_size;
 	if (max_size > f->real_file_size)
@@ -532,6 +533,7 @@
 	unsigned int buflen = 0;
 	unsigned int minbs, maxbs;
 	uint64_t frand_max, r;
+	bool power_2;
 
 	assert(ddir_rw(ddir));
 
@@ -550,9 +552,9 @@
 	if (!io_u_fits(td, io_u, minbs))
 		return 0;
 
-	frand_max = rand_max(&td->bsrange_state);
+	frand_max = rand_max(&td->bsrange_state[ddir]);
 	do {
-		r = __rand(&td->bsrange_state);
+		r = __rand(&td->bsrange_state[ddir]);
 
 		if (!td->o.bssplit_nr[ddir]) {
 			buflen = 1 + (unsigned int) ((double) maxbs *
@@ -576,13 +578,11 @@
 			}
 		}
 
-		if (td->o.verify != VERIFY_NONE)
-			buflen = (buflen + td->o.verify_interval - 1) &
-				~(td->o.verify_interval - 1);
-
-		if (!td->o.bs_unaligned && is_power_of_2(minbs))
+		power_2 = is_power_of_2(minbs);
+		if (!td->o.bs_unaligned && power_2)
 			buflen &= ~(minbs - 1);
-
+		else if (!td->o.bs_unaligned && !power_2) 
+			buflen -= buflen % minbs; 
 	} while (!io_u_fits(td, io_u, buflen));
 
 	return buflen;
@@ -646,7 +646,7 @@
 	}
 
 	while (td->io_u_in_flight) {
-		int fio_unused ret;
+		int ret;
 
 		ret = io_u_queued_complete(td, 1);
 		if (ret > 0)
@@ -717,28 +717,22 @@
 	enum fio_ddir ddir;
 
 	/*
-	 * see if it's time to fsync
+	 * See if it's time to fsync/fdatasync/sync_file_range first,
+	 * and if not then move on to check regular I/Os.
 	 */
-	if (td->o.fsync_blocks &&
-	   !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) &&
-	     td->io_issues[DDIR_WRITE] && should_fsync(td))
-		return DDIR_SYNC;
-
-	/*
-	 * see if it's time to fdatasync
-	 */
-	if (td->o.fdatasync_blocks &&
-	   !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) &&
-	     td->io_issues[DDIR_WRITE] && should_fsync(td))
-		return DDIR_DATASYNC;
-
-	/*
-	 * see if it's time to sync_file_range
-	 */
-	if (td->sync_file_range_nr &&
-	   !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) &&
-	     td->io_issues[DDIR_WRITE] && should_fsync(td))
-		return DDIR_SYNC_FILE_RANGE;
+	if (should_fsync(td)) {
+		if (td->o.fsync_blocks && td->io_issues[DDIR_WRITE] &&
+		    !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks))
+			return DDIR_SYNC;
+
+		if (td->o.fdatasync_blocks && td->io_issues[DDIR_WRITE] &&
+		    !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks))
+			return DDIR_DATASYNC;
+
+		if (td->sync_file_range_nr && td->io_issues[DDIR_WRITE] &&
+		    !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr))
+			return DDIR_SYNC_FILE_RANGE;
+	}
 
 	if (td_rw(td)) {
 		/*
@@ -762,8 +756,10 @@
 		ddir = DDIR_READ;
 	else if (td_write(td))
 		ddir = DDIR_WRITE;
-	else
+	else if (td_trim(td))
 		ddir = DDIR_TRIM;
+	else
+		ddir = DDIR_INVAL;
 
 	td->rwmix_ddir = rate_ddir(td, ddir);
 	return td->rwmix_ddir;
@@ -903,8 +899,9 @@
 	}
 
 	if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
-		dprint(FD_IO, "io_u %p, offset too large\n", io_u);
-		dprint(FD_IO, "  off=%llu/%lu > %llu\n",
+		dprint(FD_IO, "io_u %p, offset + buflen exceeds file size\n",
+			io_u);
+		dprint(FD_IO, "  offset=%llu/buflen=%lu > %llu\n",
 			(unsigned long long) io_u->offset, io_u->buflen,
 			(unsigned long long) io_u->file->real_file_size);
 		return 1;
@@ -992,11 +989,52 @@
 	td->ts.io_u_map[idx] += nr;
 }
 
-static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec)
+static void io_u_mark_lat_nsec(struct thread_data *td, unsigned long long nsec)
+{
+	int idx = 0;	/* bucket in td->ts.io_u_lat_n[]; stays 0 for 0..1 nsec */
+
+	assert(nsec < 1000);	/* the ranges below only cover 0..999 nsec */
+
+	switch (nsec) {
+	case 750 ... 999:
+		idx = 9;
+		break;
+	case 500 ... 749:
+		idx = 8;
+		break;
+	case 250 ... 499:
+		idx = 7;
+		break;
+	case 100 ... 249:
+		idx = 6;
+		break;
+	case 50 ... 99:
+		idx = 5;
+		break;
+	case 20 ... 49:
+		idx = 4;
+		break;
+	case 10 ... 19:
+		idx = 3;
+		break;
+	case 4 ... 9:
+		idx = 2;
+		break;
+	case 2 ... 3:
+		idx = 1;	/* fall through */
+	case 0 ... 1:
+		break;
+	}
+
+	assert(idx < FIO_IO_U_LAT_N_NR);
+	td->ts.io_u_lat_n[idx]++;	/* bump the counter of the selected bucket */
+}
+
+static void io_u_mark_lat_usec(struct thread_data *td, unsigned long long usec)
 {
 	int idx = 0;
 
-	assert(usec < 1000);
+	assert(usec < 1000 && usec >= 1);
 
 	switch (usec) {
 	case 750 ... 999:
@@ -1033,10 +1071,12 @@
 	td->ts.io_u_lat_u[idx]++;
 }
 
-static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec)
+static void io_u_mark_lat_msec(struct thread_data *td, unsigned long long msec)
 {
 	int idx = 0;
 
+	assert(msec >= 1);
+
 	switch (msec) {
 	default:
 		idx = 11;
@@ -1078,12 +1118,14 @@
 	td->ts.io_u_lat_m[idx]++;
 }
 
-static void io_u_mark_latency(struct thread_data *td, unsigned long usec)
+static void io_u_mark_latency(struct thread_data *td, unsigned long long nsec)
 {
-	if (usec < 1000)
-		io_u_mark_lat_usec(td, usec);
+	if (nsec < 1000)
+		io_u_mark_lat_nsec(td, nsec);
+	else if (nsec < 1000000)
+		io_u_mark_lat_usec(td, nsec / 1000);
 	else
-		io_u_mark_lat_msec(td, usec / 1000);
+		io_u_mark_lat_msec(td, nsec / 1000000);
 }
 
 static unsigned int __get_next_fileno_rand(struct thread_data *td)
@@ -1560,7 +1602,7 @@
 	unsigned int i, nr_blocks = io_u->buflen / 512;
 	uint64_t boffset;
 	unsigned int offset;
-	void *p, *end;
+	char *p, *end;
 
 	if (!nr_blocks)
 		return;
@@ -1575,7 +1617,7 @@
 		 * the buffer, given by the product of the usec time
 		 * and the actual offset.
 		 */
-		offset = (io_u->start_time.tv_usec ^ boffset) & 511;
+		offset = ((io_u->start_time.tv_nsec/1000) ^ boffset) & 511;
 		offset &= ~(sizeof(uint64_t) - 1);
 		if (offset >= 512 - sizeof(uint64_t))
 			offset -= sizeof(uint64_t);
@@ -1677,8 +1719,10 @@
 	if (!td_io_prep(td, io_u)) {
 		if (!td->o.disable_lat)
 			fio_gettime(&io_u->start_time, NULL);
+
 		if (do_scramble)
 			small_content_scramble(io_u);
+
 		return io_u;
 	}
 err_put:
@@ -1730,43 +1774,46 @@
 				  const enum fio_ddir idx, unsigned int bytes)
 {
 	const int no_reduce = !gtod_reduce(td);
-	unsigned long lusec = 0;
+	unsigned long long llnsec = 0;
 
 	if (td->parent)
 		td = td->parent;
 
+	if (!td->o.stats)
+		return;
+
 	if (no_reduce)
-		lusec = utime_since(&io_u->issue_time, &icd->time);
+		llnsec = ntime_since(&io_u->issue_time, &icd->time);
 
 	if (!td->o.disable_lat) {
-		unsigned long tusec;
+		unsigned long long tnsec;
 
-		tusec = utime_since(&io_u->start_time, &icd->time);
-		add_lat_sample(td, idx, tusec, bytes, io_u->offset);
+		tnsec = ntime_since(&io_u->start_time, &icd->time);
+		add_lat_sample(td, idx, tnsec, bytes, io_u->offset);
 
 		if (td->flags & TD_F_PROFILE_OPS) {
 			struct prof_io_ops *ops = &td->prof_io_ops;
 
 			if (ops->io_u_lat)
-				icd->error = ops->io_u_lat(td, tusec);
+				icd->error = ops->io_u_lat(td, tnsec/1000);
 		}
 
-		if (td->o.max_latency && tusec > td->o.max_latency)
-			lat_fatal(td, icd, tusec, td->o.max_latency);
-		if (td->o.latency_target && tusec > td->o.latency_target) {
+		if (td->o.max_latency && tnsec/1000 > td->o.max_latency)
+			lat_fatal(td, icd, tnsec/1000, td->o.max_latency);
+		if (td->o.latency_target && tnsec/1000 > td->o.latency_target) {
 			if (lat_target_failed(td))
-				lat_fatal(td, icd, tusec, td->o.latency_target);
+				lat_fatal(td, icd, tnsec/1000, td->o.latency_target);
 		}
 	}
 
 	if (ddir_rw(idx)) {
 		if (!td->o.disable_clat) {
-			add_clat_sample(td, idx, lusec, bytes, io_u->offset);
-			io_u_mark_latency(td, lusec);
+			add_clat_sample(td, idx, llnsec, bytes, io_u->offset);
+			io_u_mark_latency(td, llnsec);
 		}
 
 		if (!td->o.disable_bw && per_unit_log(td->bw_log))
-			add_bw_sample(td, io_u, bytes, lusec);
+			add_bw_sample(td, io_u, bytes, llnsec);
 
 		if (no_reduce && per_unit_log(td->iops_log))
 			add_iops_sample(td, io_u, bytes);
@@ -1906,7 +1953,7 @@
 	icd->nr = nr;
 
 	icd->error = 0;
-	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
 		icd->bytes_done[ddir] = 0;
 }
 
@@ -1945,7 +1992,7 @@
 		return -1;
 	}
 
-	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
 		td->bytes_done[ddir] += icd.bytes_done[ddir];
 
 	return 0;
@@ -1961,7 +2008,7 @@
 	int ret, ddir;
 	struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
 
-	dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_evts);
+	dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts);
 
 	if (!min_evts)
 		tvp = &ts;
@@ -1984,7 +2031,7 @@
 		return -1;
 	}
 
-	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
 		td->bytes_done[ddir] += icd.bytes_done[ddir];
 
 	return ret;
@@ -1995,10 +2042,10 @@
  */
 void io_u_queued(struct thread_data *td, struct io_u *io_u)
 {
-	if (!td->o.disable_slat) {
+	if (!td->o.disable_slat && ramp_time_over(td) && td->o.stats) {
 		unsigned long slat_time;
 
-		slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
+		slat_time = ntime_since(&io_u->start_time, &io_u->issue_time);
 
 		if (td->parent)
 			td = td->parent;
@@ -2043,6 +2090,9 @@
 {
 	struct thread_options *o = &td->o;
 
+	if (o->mem_type == MEM_CUDA_MALLOC)
+		return;
+
 	if (o->compress_percentage || o->dedupe_percentage) {
 		unsigned int perc = td->o.compress_percentage;
 		struct frand_state *rs;
@@ -2088,3 +2138,61 @@
 	io_u->buf_filled_len = 0;
 	fill_io_buffer(td, io_u->buf, min_write, max_bs);
 }
+
+static int do_sync_file_range(const struct thread_data *td,
+			      struct fio_file *f)
+{
+	off64_t offset, nbytes;
+
+	offset = f->first_write;
+	nbytes = f->last_write - f->first_write;
+
+	if (!nbytes)
+		return 0;
+
+	return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range);
+}
+
+int do_io_u_sync(const struct thread_data *td, struct io_u *io_u)
+{
+	int ret;
+
+	if (io_u->ddir == DDIR_SYNC) {
+		ret = fsync(io_u->file->fd);
+	} else if (io_u->ddir == DDIR_DATASYNC) {
+#ifdef CONFIG_FDATASYNC
+		ret = fdatasync(io_u->file->fd);
+#else
+		ret = io_u->xfer_buflen;
+		io_u->error = EINVAL;
+#endif
+	} else if (io_u->ddir == DDIR_SYNC_FILE_RANGE)
+		ret = do_sync_file_range(td, io_u->file);
+	else {
+		ret = io_u->xfer_buflen;
+		io_u->error = EINVAL;
+	}
+
+	if (ret < 0)
+		io_u->error = errno;
+
+	return ret;
+}
+
+int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
+{
+#ifndef FIO_HAVE_TRIM
+	io_u->error = EINVAL;
+	return 0;
+#else
+	struct fio_file *f = io_u->file;
+	int ret;
+
+	ret = os_trim(f, io_u->offset, io_u->xfer_buflen);
+	if (!ret)
+		return io_u->xfer_buflen;
+
+	io_u->error = ret;
+	return 0;
+#endif
+}
diff -Nru fio-2.16/io_u.h fio-3.1/io_u.h
--- fio-2.16/io_u.h	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/io_u.h	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,179 @@
+#ifndef FIO_IO_U
+#define FIO_IO_U
+
+#include "compiler/compiler.h"
+#include "os/os.h"
+#include "log.h"
+#include "io_ddir.h"
+#include "debug.h"
+#include "file.h"
+#include "workqueue.h"
+
+#ifdef CONFIG_LIBAIO
+#include <libaio.h>
+#endif
+#ifdef CONFIG_GUASI
+#include <guasi.h>
+#endif
+
+enum {
+	IO_U_F_FREE		= 1 << 0,
+	IO_U_F_FLIGHT		= 1 << 1,
+	IO_U_F_NO_FILE_PUT	= 1 << 2,
+	IO_U_F_IN_CUR_DEPTH	= 1 << 3,
+	IO_U_F_BUSY_OK		= 1 << 4,
+	IO_U_F_TRIMMED		= 1 << 5,
+	IO_U_F_BARRIER		= 1 << 6,
+	IO_U_F_VER_LIST		= 1 << 7,
+};
+
+/*
+ * The io unit
+ */
+struct io_u {
+	struct timespec start_time;
+	struct timespec issue_time;
+
+	struct fio_file *file;
+	unsigned int flags;
+	enum fio_ddir ddir;
+
+	/*
+	 * For replay workloads, we may want to account as a different
+	 * IO type than what is being submitted.
+	 */
+	enum fio_ddir acct_ddir;
+
+	/*
+	 * Write generation
+	 */
+	unsigned short numberio;
+
+	/*
+	 * Allocated/set buffer and length
+	 */
+	unsigned long buflen;
+	unsigned long long offset;
+	void *buf;
+
+	/*
+	 * Initial seed for generating the buffer contents
+	 */
+	uint64_t rand_seed;
+
+	/*
+	 * IO engine state, may be different from above when we get
+	 * partial transfers / residual data counts
+	 */
+	void *xfer_buf;
+	unsigned long xfer_buflen;
+
+	/*
+	 * Parameter related to pre-filled buffers and
+	 * their size to handle variable block sizes.
+	 */
+	unsigned long buf_filled_len;
+
+	struct io_piece *ipo;
+
+	unsigned int resid;
+	unsigned int error;
+
+	/*
+	 * io engine private data
+	 */
+	union {
+		unsigned int index;
+		unsigned int seen;
+		void *engine_data;
+	};
+
+	union {
+		struct flist_head verify_list;
+		struct workqueue_work work;
+	};
+
+	/*
+	 * Callback for io completion
+	 */
+	int (*end_io)(struct thread_data *, struct io_u **);
+
+	union {
+#ifdef CONFIG_LIBAIO
+		struct iocb iocb;
+#endif
+#ifdef CONFIG_POSIXAIO
+		os_aiocb_t aiocb;
+#endif
+#ifdef FIO_HAVE_SGIO
+		struct sg_io_hdr hdr;
+#endif
+#ifdef CONFIG_GUASI
+		guasi_req_t greq;
+#endif
+#ifdef CONFIG_SOLARISAIO
+		aio_result_t resultp;
+#endif
+#ifdef FIO_HAVE_BINJECT
+		struct b_user_cmd buc;
+#endif
+#ifdef CONFIG_RDMA
+		struct ibv_mr *mr;
+#endif
+		void *mmap_data;
+	};
+};
+
+/*
+ * io unit handling
+ */
+extern struct io_u *__get_io_u(struct thread_data *);
+extern struct io_u *get_io_u(struct thread_data *);
+extern void put_io_u(struct thread_data *, struct io_u *);
+extern void clear_io_u(struct thread_data *, struct io_u *);
+extern void requeue_io_u(struct thread_data *, struct io_u **);
+extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *);
+extern int __must_check io_u_queued_complete(struct thread_data *, int);
+extern void io_u_queued(struct thread_data *, struct io_u *);
+extern int io_u_quiesce(struct thread_data *);
+extern void io_u_log_error(struct thread_data *, struct io_u *);
+extern void io_u_mark_depth(struct thread_data *, unsigned int);
+extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned int);
+extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int);
+void io_u_mark_complete(struct thread_data *, unsigned int);
+void io_u_mark_submit(struct thread_data *, unsigned int);
+bool queue_full(const struct thread_data *);
+
+int do_io_u_sync(const struct thread_data *, struct io_u *);
+int do_io_u_trim(const struct thread_data *, struct io_u *);
+
+#ifdef FIO_INC_DEBUG
+static inline void dprint_io_u(struct io_u *io_u, const char *p)
+{
+	struct fio_file *f = io_u->file;
+
+	dprint(FD_IO, "%s: io_u %p: off=%llu/len=%lu/ddir=%d", p, io_u,
+					(unsigned long long) io_u->offset,
+					io_u->buflen, io_u->ddir);
+	if (f)
+		dprint(FD_IO, "/%s", f->file_name);
+	dprint(FD_IO, "\n");
+}
+#else
+#define dprint_io_u(io_u, p)
+#endif
+
+static inline enum fio_ddir acct_ddir(struct io_u *io_u)
+{
+	if (io_u->acct_ddir != -1)
+		return io_u->acct_ddir;
+
+	return io_u->ddir;
+}
+
+#define io_u_clear(td, io_u, val)	\
+	td_flags_clear((td), &(io_u)->flags, (val))
+#define io_u_set(td, io_u, val)		\
+	td_flags_set((td), &(io_u)->flags, (val))
+
+#endif
diff -Nru fio-2.16/lib/axmap.c fio-3.1/lib/axmap.c
--- fio-2.16/lib/axmap.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/axmap.c	2017-09-28 10:23:20.000000000 +0000
@@ -184,6 +184,9 @@
 void axmap_clear(struct axmap *axmap, uint64_t bit_nr)
 {
 	axmap_handler(axmap, bit_nr, axmap_clear_fn, NULL);
+
+	if (bit_nr < axmap->first_free)
+		axmap->first_free = bit_nr;
 }
 
 struct axmap_set_data {
@@ -191,7 +194,7 @@
 	unsigned int set_bits;
 };
 
-static unsigned long bit_masks[] = {
+static const unsigned long bit_masks[] = {
 	0x0000000000000000, 0x0000000000000001, 0x0000000000000003, 0x0000000000000007,
 	0x000000000000000f, 0x000000000000001f, 0x000000000000003f, 0x000000000000007f,
 	0x00000000000000ff, 0x00000000000001ff, 0x00000000000003ff, 0x00000000000007ff,
@@ -372,10 +375,9 @@
 
 static uint64_t axmap_first_free(struct axmap *axmap)
 {
-	if (firstfree_valid(axmap))
-		return axmap->first_free;
+	if (!firstfree_valid(axmap))
+		axmap->first_free = axmap_find_first_free(axmap, axmap->nr_levels - 1, 0);
 
-	axmap->first_free = axmap_find_first_free(axmap, axmap->nr_levels - 1, 0);
 	return axmap->first_free;
 }
 
diff -Nru fio-2.16/lib/bloom.c fio-3.1/lib/bloom.c
--- fio-2.16/lib/bloom.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/bloom.c	2017-09-28 10:23:20.000000000 +0000
@@ -65,6 +65,7 @@
 	struct bloom *b;
 	size_t no_uints;
 
+	crc32c_arm64_probe();
 	crc32c_intel_probe();
 
 	b = malloc(sizeof(*b));
@@ -103,8 +104,10 @@
 
 		if (b->map[index] & (1U << bit))
 			was_set++;
-		if (set)
+		else if (set)
 			b->map[index] |= 1U << bit;
+		else
+			break;
 	}
 
 	return was_set == N_HASHES;
diff -Nru fio-2.16/lib/ffz.h fio-3.1/lib/ffz.h
--- fio-2.16/lib/ffz.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/ffz.h	2017-09-28 10:23:20.000000000 +0000
@@ -27,10 +27,8 @@
 		word >>= 2;
 		r += 2;
 	}
-	if (!(word & 1)) {
-		word >>= 1;
+	if (!(word & 1))
 		r += 1;
-	}
 
 	return r;
 }
diff -Nru fio-2.16/lib/memalign.c fio-3.1/lib/memalign.c
--- fio-2.16/lib/memalign.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/memalign.c	2017-09-28 10:23:20.000000000 +0000
@@ -4,13 +4,13 @@
 
 #include "memalign.h"
 
+#define PTR_ALIGN(ptr, mask)   \
+	(char *)((uintptr_t)((ptr) + (mask)) & ~(mask))
+
 struct align_footer {
 	unsigned int offset;
 };
 
-#define PTR_ALIGN(ptr, mask)	\
-	(char *) (((uintptr_t) ((ptr) + (mask)) & ~(mask)))
-
 void *fio_memalign(size_t alignment, size_t size)
 {
 	struct align_footer *f;
@@ -18,7 +18,7 @@
 
 	assert(!(alignment & (alignment - 1)));
 
-	ptr = malloc(size + alignment + size + sizeof(*f) - 1);
+	ptr = malloc(size + alignment + sizeof(*f) - 1);
 	if (ptr) {
 		ret = PTR_ALIGN(ptr, alignment - 1);
 		f = ret + size;
diff -Nru fio-2.16/lib/mountcheck.c fio-3.1/lib/mountcheck.c
--- fio-2.16/lib/mountcheck.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/mountcheck.c	2017-09-28 10:23:20.000000000 +0000
@@ -4,7 +4,7 @@
 #ifdef CONFIG_GETMNTENT
 #include <mntent.h>
 
-#include "lib/mountcheck.h"
+#include "mountcheck.h"
 
 #define MTAB	"/etc/mtab"
 
diff -Nru fio-2.16/lib/num2str.c fio-3.1/lib/num2str.c
--- fio-2.16/lib/num2str.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/num2str.c	2017-09-28 10:23:20.000000000 +0000
@@ -2,40 +2,71 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "../fio.h"
+#include "../compiler/compiler.h"
+#include "num2str.h"
 
-#define ARRAY_LENGTH(arr)	sizeof(arr) / sizeof((arr)[0])
+#define ARRAY_SIZE(x)    (sizeof((x)) / (sizeof((x)[0])))
 
-/*
- * Cheesy number->string conversion, complete with carry rounding error.
+/**
+ * num2str() - Cheesy number->string conversion, complete with carry rounding error.
+ * @num: quantity (e.g., number of blocks, bytes or bits)
+ * @maxlen: max number of digits in the output string (not counting prefix and units, but counting .)
+ * @base: multiplier for num (e.g., if num represents Ki, use 1024)
+ * @pow2: select unit prefix - 0=power-of-10 decimal SI, nonzero=power-of-2 binary IEC
+ * @units: select units - N2S_* macros defined in num2str.h
+ * @returns a malloc'd buffer containing "number[<unit prefix>][<units>]"
  */
-char *num2str(uint64_t num, int maxlen, int base, int pow2, int unit_base)
+char *num2str(uint64_t num, int maxlen, int base, int pow2, int units)
 {
-	const char *postfix[] = { "", "K", "M", "G", "P", "E" };
-	const char *byte_postfix[] = { "", "B", "bit" };
+	const char *sistr[] = { "", "k", "M", "G", "T", "P" };
+	const char *iecstr[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi" };
+	const char **unitprefix;
+	const char *unitstr[] = { "", "/s", "B", "bit", "B/s", "bit/s" };
 	const unsigned int thousand[] = { 1000, 1024 };
-	unsigned int modulo, decimals;
-	int byte_post_index = 0, post_index, carry = 0;
-	char tmp[32];
+	unsigned int modulo;
+	int unit_index = 0, post_index, carry = 0;
+	char tmp[32], fmt[32];
 	char *buf;
 
+	compiletime_assert(sizeof(sistr) == sizeof(iecstr), "unit prefix arrays must be identical sizes");
+
 	buf = malloc(128);
+	if (!buf)
+		return NULL;
+
+	if (pow2)
+		unitprefix = iecstr;
+	else
+		unitprefix = sistr;
 
 	for (post_index = 0; base > 1; post_index++)
 		base /= thousand[!!pow2];
 
-	switch (unit_base) {
-	case 1:
-		byte_post_index = 2;
+	switch (units) {
+	case N2S_PERSEC:
+		unit_index = 1;
+		break;
+	case N2S_BYTE:
+		unit_index = 2;
+		break;
+	case N2S_BIT:
+		unit_index = 3;
 		num *= 8;
 		break;
-	case 8:
-		byte_post_index = 1;
+	case N2S_BYTEPERSEC:
+		unit_index = 4;
+		break;
+	case N2S_BITPERSEC:
+		unit_index = 5;
+		num *= 8;
 		break;
 	}
 
+	/*
+	 * Divide by K/Ki until num fits in maxlen digits or prefixes run out.
+	 */
 	modulo = -1U;
-	while (post_index < sizeof(postfix)) {
+	while (post_index < ARRAY_SIZE(sistr)) {
 		sprintf(tmp, "%llu", (unsigned long long) num);
 		if (strlen(tmp) <= maxlen)
 			break;
@@ -46,33 +77,38 @@
 		post_index++;
 	}
 
+	/*
+	 * If no modulo, then we're done.
+	 */
 	if (modulo == -1U) {
 done:
-		if (post_index >= ARRAY_LENGTH(postfix))
+		if (post_index >= ARRAY_SIZE(sistr))
 			post_index = 0;
 
 		sprintf(buf, "%llu%s%s", (unsigned long long) num,
-			postfix[post_index], byte_postfix[byte_post_index]);
+			unitprefix[post_index], unitstr[unit_index]);
 		return buf;
 	}
 
+	/*
+	 * If no room for decimals, then we're done.
+	 */
 	sprintf(tmp, "%llu", (unsigned long long) num);
-	decimals = maxlen - strlen(tmp);
-	if (decimals <= 1) {
+	if ((int)(maxlen - strlen(tmp)) <= 1) {
 		if (carry)
 			num++;
 		goto done;
 	}
 
-	do {
-		sprintf(tmp, "%u", modulo);
-		if (strlen(tmp) <= decimals - 1)
-			break;
-
-		modulo = (modulo + 9) / 10;
-	} while (1);
+	/*
+	 * Fill in everything and return the result.
+	 */
+	assert(maxlen - strlen(tmp) - 1 > 0);
+	assert(modulo < thousand[!!pow2]);
+	sprintf(fmt, "%%.%df", (int)(maxlen - strlen(tmp) - 1));
+	sprintf(tmp, fmt, (double)modulo / (double)thousand[!!pow2]);
 
-	sprintf(buf, "%llu.%u%s%s", (unsigned long long) num, modulo,
-			postfix[post_index], byte_postfix[byte_post_index]);
+	sprintf(buf, "%llu.%s%s%s", (unsigned long long) num, &tmp[2],
+			unitprefix[post_index], unitstr[unit_index]);
 	return buf;
 }
diff -Nru fio-2.16/lib/num2str.h fio-3.1/lib/num2str.h
--- fio-2.16/lib/num2str.h	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/lib/num2str.h	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,15 @@
+#ifndef FIO_NUM2STR_H
+#define FIO_NUM2STR_H
+
+#include <inttypes.h>
+
+#define N2S_NONE	0
+#define N2S_BITPERSEC	1	/* match unit_base for bit rates */
+#define N2S_PERSEC	2
+#define N2S_BIT		3
+#define N2S_BYTE	4
+#define N2S_BYTEPERSEC	8	/* match unit_base for byte rates */
+
+extern char *num2str(uint64_t, int, int, int, int);
+
+#endif
diff -Nru fio-2.16/lib/output_buffer.c fio-3.1/lib/output_buffer.c
--- fio-2.16/lib/output_buffer.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/output_buffer.c	2017-09-28 10:23:20.000000000 +0000
@@ -3,7 +3,6 @@
 #include <stdlib.h>
 
 #include "output_buffer.h"
-#include "../log.h"
 #include "../minmax.h"
 
 #define BUF_INC	1024
@@ -18,6 +17,7 @@
 void buf_output_free(struct buf_output *out)
 {
 	free(out->buf);
+	buf_output_init(out);
 }
 
 size_t buf_output_add(struct buf_output *out, const char *buf, size_t len)
@@ -40,16 +40,3 @@
 	out->buflen += len;
 	return len;
 }
-
-size_t buf_output_flush(struct buf_output *out)
-{
-	size_t ret = 0;
-
-	if (out->buflen) {
-		ret = log_info_buf(out->buf, out->buflen);
-		memset(out->buf, 0, out->max_buflen);
-		out->buflen = 0;
-	}
-
-	return ret;
-}
diff -Nru fio-2.16/lib/output_buffer.h fio-3.1/lib/output_buffer.h
--- fio-2.16/lib/output_buffer.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/output_buffer.h	2017-09-28 10:23:20.000000000 +0000
@@ -12,6 +12,5 @@
 void buf_output_init(struct buf_output *out);
 void buf_output_free(struct buf_output *out);
 size_t buf_output_add(struct buf_output *out, const char *buf, size_t len);
-size_t buf_output_flush(struct buf_output *out);
 
 #endif
diff -Nru fio-2.16/lib/pattern.c fio-3.1/lib/pattern.c
--- fio-2.16/lib/pattern.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/pattern.c	2017-09-28 10:23:20.000000000 +0000
@@ -1,7 +1,77 @@
-#include "fio.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
 #include "strntol.h"
 #include "pattern.h"
+#include "../minmax.h"
 #include "../oslib/strcasestr.h"
+#include "../oslib/strndup.h"
+
+/**
+ * parse_file() - parses binary file to fill buffer
+ * @beg - string input, extract filename from this
+ * @out - output buffer where the file contents should be put
+ * @out_len - length of the output buffer
+ * @filled - pointer where number of bytes successfully
+ *           parsed will be put
+ *
+ * Returns the end pointer where parsing has been stopped.
+ * In case of parsing error or lack of bytes in output buffer
+ * NULL will be returned.
+ */
+static const char *parse_file(const char *beg, char *out,
+			      unsigned int out_len,
+			      unsigned int *filled)
+{
+	const char *end;
+	char *file;
+	int fd;
+	ssize_t count;
+
+	if (!out_len)
+		goto err_out;
+
+	assert(*beg == '\'');
+	beg++;
+	end = strchr(beg, '\'');
+	if (!end)
+		goto err_out;
+
+	file = strndup(beg, end - beg);
+	if (file == NULL)
+		goto err_out;
+
+	fd = open(file, O_RDONLY);
+	if (fd < 0)
+		goto err_free_out;
+
+	count = read(fd, out, out_len);
+	if (count == -1)
+		goto err_free_close_out;
+
+	*filled = count;
+	close(fd);
+	free(file);
+
+	/* Catch up quote */
+	return end + 1;
+
+err_free_close_out:
+	close(fd);
+err_free_out:
+	free(file);
+err_out:
+	return NULL;
+
+}
 
 /**
  * parse_string() - parses string in double quotes, like "abc"
@@ -264,6 +334,9 @@
 		parsed_fmt = 0;
 
 		switch (*beg) {
+		case '\'':
+			end = parse_file(beg, out, out_len, &filled);
+			break;
 		case '"':
 			end = parse_string(beg, out, out_len, &filled);
 			break;
diff -Nru fio-2.16/lib/pow2.h fio-3.1/lib/pow2.h
--- fio-2.16/lib/pow2.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/pow2.h	2017-09-28 10:23:20.000000000 +0000
@@ -2,8 +2,9 @@
 #define FIO_POW2_H
 
 #include <inttypes.h>
+#include "types.h"
 
-static inline int is_power_of_2(uint64_t val)
+static inline bool is_power_of_2(uint64_t val)
 {
 	return (val != 0 && ((val & (val - 1)) == 0));
 }
diff -Nru fio-2.16/lib/prio_tree.c fio-3.1/lib/prio_tree.c
--- fio-2.16/lib/prio_tree.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/prio_tree.c	2017-09-28 10:23:20.000000000 +0000
@@ -13,9 +13,12 @@
 
 #include <stdlib.h>
 #include <limits.h>
-#include "../fio.h"
+
+#include "../compiler/compiler.h"
 #include "prio_tree.h"
 
+#define ARRAY_SIZE(x)    (sizeof((x)) / (sizeof((x)[0])))
+
 /*
  * A clever mix of heap and radix trees forms a radix priority search tree (PST)
  * which is useful for storing intervals, e.g, we can consider a vma as a closed
diff -Nru fio-2.16/lib/rand.c fio-3.1/lib/rand.c
--- fio-2.16/lib/rand.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/rand.c	2017-09-28 10:23:20.000000000 +0000
@@ -36,7 +36,7 @@
 #include <string.h>
 #include <assert.h>
 #include "rand.h"
-#include "lib/pattern.h"
+#include "pattern.h"
 #include "../hash.h"
 
 int arch_random;
diff -Nru fio-2.16/lib/seqlock.h fio-3.1/lib/seqlock.h
--- fio-2.16/lib/seqlock.h	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/lib/seqlock.h	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,49 @@
+#ifndef FIO_SEQLOCK_H
+#define FIO_SEQLOCK_H
+
+#include "types.h"
+#include "../arch/arch.h"
+
+struct seqlock {
+	volatile unsigned int sequence;	/* odd while a write is in progress; unsigned so wraparound is defined */
+};
+
+static inline void seqlock_init(struct seqlock *s)
+{
+	s->sequence = 0;
+}
+
+static inline unsigned int read_seqlock_begin(struct seqlock *s)
+{
+	unsigned int seq;
+
+	do {
+		seq = s->sequence;
+		if (!(seq & 1))
+			break;
+		nop;
+	} while (1);
+
+	read_barrier();
+	return seq;
+}
+
+static inline bool read_seqlock_retry(struct seqlock *s, unsigned int seq)
+{
+	read_barrier();
+	return s->sequence != seq;
+}
+
+static inline void write_seqlock_begin(struct seqlock *s)
+{
+	s->sequence++;
+	write_barrier();
+}
+
+static inline void write_seqlock_end(struct seqlock *s)
+{
+	write_barrier();
+	s->sequence++;
+}
+
+#endif
diff -Nru fio-2.16/lib/strntol.c fio-3.1/lib/strntol.c
--- fio-2.16/lib/strntol.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/strntol.c	2017-09-28 10:23:20.000000000 +0000
@@ -2,7 +2,7 @@
 #include <stdlib.h>
 #include <limits.h>
 
-#include "lib/strntol.h"
+#include "strntol.h"
 
 long strntol(const char *str, size_t sz, char **end, int base)
 {
diff -Nru fio-2.16/lib/zipf.c fio-3.1/lib/zipf.c
--- fio-2.16/lib/zipf.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/lib/zipf.c	2017-09-28 10:23:20.000000000 +0000
@@ -6,7 +6,6 @@
 #include <sys/types.h>
 #include <fcntl.h>
 #include "ieee754.h"
-#include "../log.h"
 #include "zipf.h"
 #include "../minmax.h"
 #include "../hash.h"
diff -Nru fio-2.16/libfio.c fio-3.1/libfio.c
--- fio-2.16/libfio.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/libfio.c	2017-09-28 10:23:20.000000000 +0000
@@ -36,12 +36,7 @@
 #include "helper_thread.h"
 #include "filehash.h"
 
-/*
- * Just expose an empty list, if the OS does not support disk util stats
- */
-#ifndef FIO_HAVE_DISK_UTIL
 FLIST_HEAD(disk_list);
-#endif
 
 unsigned long arch_flags = 0;
 
@@ -149,10 +144,10 @@
 	}
 
 	set_epoch_time(td, td->o.log_unix_epoch);
-	memcpy(&td->start, &td->epoch, sizeof(struct timeval));
-	memcpy(&td->iops_sample_time, &td->epoch, sizeof(struct timeval));
-	memcpy(&td->bw_sample_time, &td->epoch, sizeof(struct timeval));
-	memcpy(&td->ss.prev_time, &td->epoch, sizeof(struct timeval));
+	memcpy(&td->start, &td->epoch, sizeof(td->epoch));
+	memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch));
+	memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
+	memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch));
 
 	lat_target_reset(td);
 	clear_rusage_stat(td);
@@ -281,7 +276,7 @@
 	int nr_io_threads = 0;
 
 	for_each_td(td, i) {
-		if (td->flags & TD_F_NOIO)
+		if (td->io_ops_init && td_ioengine_flagged(td, FIO_NOIO))
 			continue;
 		nr_io_threads++;
 		if (td->runstate < TD_EXITED)
@@ -311,6 +306,13 @@
 	return flags;
 }
 
+enum {
+	ENDIAN_INVALID_BE = 1,
+	ENDIAN_INVALID_LE,
+	ENDIAN_INVALID_CONFIG,
+	ENDIAN_BROKEN,
+};
+
 static int endian_check(void)
 {
 	union {
@@ -327,16 +329,16 @@
 
 #if defined(CONFIG_LITTLE_ENDIAN)
 	if (be)
-		return 1;
+		return ENDIAN_INVALID_BE;
 #elif defined(CONFIG_BIG_ENDIAN)
 	if (le)
-		return 1;
+		return ENDIAN_INVALID_LE;
 #else
-	return 1;
+	return ENDIAN_INVALID_CONFIG;
 #endif
 
 	if (!le && !be)
-		return 1;
+		return ENDIAN_BROKEN;
 
 	return 0;
 }
@@ -344,23 +346,45 @@
 int initialize_fio(char *envp[])
 {
 	long ps;
+	int err;
 
 	/*
 	 * We need these to be properly 64-bit aligned, otherwise we
 	 * can run into problems on archs that fault on unaligned fp
 	 * access (ARM).
 	 */
+	compiletime_assert((offsetof(struct thread_data, ts) % sizeof(void *)) == 0, "ts");
 	compiletime_assert((offsetof(struct thread_stat, percentile_list) % 8) == 0, "stat percentile_list");
 	compiletime_assert((offsetof(struct thread_stat, total_run_time) % 8) == 0, "total_run_time");
 	compiletime_assert((offsetof(struct thread_stat, total_err_count) % 8) == 0, "total_err_count");
 	compiletime_assert((offsetof(struct thread_stat, latency_percentile) % 8) == 0, "stat latency_percentile");
+	compiletime_assert((offsetof(struct thread_data, ts.clat_stat) % 8) == 0, "ts.clat_stat");
 	compiletime_assert((offsetof(struct thread_options_pack, zipf_theta) % 8) == 0, "zipf_theta");
 	compiletime_assert((offsetof(struct thread_options_pack, pareto_h) % 8) == 0, "pareto_h");
 	compiletime_assert((offsetof(struct thread_options_pack, percentile_list) % 8) == 0, "percentile_list");
 	compiletime_assert((offsetof(struct thread_options_pack, latency_percentile) % 8) == 0, "latency_percentile");
+	compiletime_assert((offsetof(struct jobs_eta, m_rate) % 8) == 0, "m_rate");
 
-	if (endian_check()) {
+	err = endian_check();
+	if (err) {
 		log_err("fio: endianness settings appear wrong.\n");
+		switch (err) {
+		case ENDIAN_INVALID_BE:
+			log_err("fio: got big-endian when configured for little\n");
+			break;
+		case ENDIAN_INVALID_LE:
+			log_err("fio: got little-endian when configured for big\n");
+			break;
+		case ENDIAN_INVALID_CONFIG:
+			log_err("fio: not configured to any endianness\n");
+			break;
+		case ENDIAN_BROKEN:
+			log_err("fio: failed to detect endianness\n");
+			break;
+		default:
+			assert(0);
+			break;
+		}
 		log_err("fio: please report this to fio@vger.kernel.org\n");
 		return 1;
 	}
diff -Nru fio-2.16/log.c fio-3.1/log.c
--- fio-2.16/log.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/log.c	2017-09-28 10:23:20.000000000 +0000
@@ -6,8 +6,16 @@
 
 #include "fio.h"
 
+#define LOG_START_SZ		512
+
 size_t log_info_buf(const char *buf, size_t len)
 {
+	/*
+	 * buf could be NULL (not just "").
+	 */
+	if (!buf)
+		return 0;
+
 	if (is_backend) {
 		size_t ret = fio_server_text_output(FIO_LOG_INFO, buf, len);
 		if (ret != -1)
@@ -21,40 +29,66 @@
 		return fwrite(buf, len, 1, f_out);
 }
 
-size_t log_valist(const char *str, va_list args)
+static size_t valist_to_buf(char **buffer, const char *fmt, va_list src_args)
 {
-	char buffer[1024];
+	size_t len, cur = LOG_START_SZ;
+	va_list args;
+
+	do {
+		*buffer = calloc(1, cur);
+
+		va_copy(args, src_args);
+		len = vsnprintf(*buffer, cur, fmt, args);
+		va_end(args);
+
+		if (len < cur)
+			break;
+
+		cur = len + 1;
+		free(*buffer);
+	} while (1);
+
+	return len;
+}
+
+size_t log_valist(const char *fmt, va_list args)
+{
+	char *buffer;
 	size_t len;
 
-	len = vsnprintf(buffer, sizeof(buffer), str, args);
+	len = valist_to_buf(&buffer, fmt, args);
+	len = log_info_buf(buffer, len);
+	free(buffer);
 
-	return log_info_buf(buffer, min(len, sizeof(buffer) - 1));
+	return len;
 }
 
 size_t log_info(const char *format, ...)
 {
-	char buffer[1024];
 	va_list args;
-	size_t len;
+	size_t ret;
 
 	va_start(args, format);
-	len = vsnprintf(buffer, sizeof(buffer), format, args);
+	ret = log_valist(format, args);
 	va_end(args);
 
-	return log_info_buf(buffer, min(len, sizeof(buffer) - 1));
+	return ret;
 }
 
 size_t __log_buf(struct buf_output *buf, const char *format, ...)
 {
-	char buffer[1024];
+	char *buffer;
 	va_list args;
 	size_t len;
 
 	va_start(args, format);
-	len = vsnprintf(buffer, sizeof(buffer), format, args);
+	len = valist_to_buf(&buffer, format, args);
 	va_end(args);
 
-	return buf_output_add(buf, buffer, min(len, sizeof(buffer) - 1));
+	len = buf_output_add(buf, buffer, len);
+	free(buffer);
+
+	return len;
 }
 
 int log_info_flush(void)
@@ -67,33 +101,33 @@
 
 size_t log_err(const char *format, ...)
 {
-	char buffer[1024];
+	size_t ret, len;
+	char *buffer;
 	va_list args;
-	size_t len;
 
 	va_start(args, format);
-	len = vsnprintf(buffer, sizeof(buffer), format, args);
+	len = valist_to_buf(&buffer, format, args);
 	va_end(args);
-	len = min(len, sizeof(buffer) - 1);
 
 	if (is_backend) {
-		size_t ret = fio_server_text_output(FIO_LOG_ERR, buffer, len);
+		ret = fio_server_text_output(FIO_LOG_ERR, buffer, len);
 		if (ret != -1)
-			return ret;
+			goto done;
 	}
 
 	if (log_syslog) {
 		syslog(LOG_INFO, "%s", buffer);
-		return len;
+		ret = len;
 	} else {
-		if (f_err != stderr) {
-			int fio_unused ret;
-
+		if (f_err != stderr)
 			ret = fwrite(buffer, len, 1, stderr);
-		}
 
-		return fwrite(buffer, len, 1, f_err);
+		ret = fwrite(buffer, len, 1, f_err);
 	}
+
+done:
+	free(buffer);
+	return ret;
 }
 
 const char *log_get_level(int level)
diff -Nru fio-2.16/log.h fio-3.1/log.h
--- fio-2.16/log.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/log.h	2017-09-28 10:23:20.000000000 +0000
@@ -16,13 +16,15 @@
 extern size_t log_info_buf(const char *buf, size_t len);
 extern int log_info_flush(void);
 
-#define log_buf(buf, format, args...)		\
-do {						\
-	if ((buf) != NULL)			\
-		__log_buf(buf, format, ##args);	\
-	else					\
-		log_info(format, ##args);	\
-} while (0)
+#define log_buf(buf, format, args...)			\
+({							\
+	size_t __ret;					\
+	if ((buf) != NULL)				\
+		__ret = __log_buf(buf, format, ##args);	\
+	else						\
+		__ret = log_info(format, ##args);	\
+	__ret;						\
+})
 
 enum {
 	FIO_LOG_DEBUG	= 1,
diff -Nru fio-2.16/Makefile fio-3.1/Makefile
--- fio-2.16/Makefile	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/Makefile	2017-09-28 10:23:20.000000000 +0000
@@ -26,7 +26,7 @@
 CFLAGS	= -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR)
 LIBS	+= -lm $(EXTLIBS)
 PROGS	= fio
-SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/fio_latency2csv.py tools/hist/fiologparser_hist.py)
+SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/fio_jsonplus_clat2csv)
 
 ifndef CONFIG_FIO_NO_OPT
   CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2
@@ -36,12 +36,13 @@
   PROGS += gfio
 endif
 
-SOURCE :=	$(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
-		$(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c)) \
+SOURCE :=	$(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
+		$(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c))) \
 		gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \
 		eta.c verify.c memory.c io_u.c parse.c mutex.c options.c \
 		smalloc.c filehash.c profile.c debug.c engines/cpu.c \
 		engines/mmap.c engines/sync.c engines/null.c engines/net.c \
+		engines/ftruncate.c \
 		server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
 		gettime-thread.c helpers.c json.c idletime.c td_error.c \
 		profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
@@ -106,6 +107,9 @@
 ifndef CONFIG_STRLCAT
   SOURCE += oslib/strlcat.c
 endif
+ifndef CONFIG_HAVE_STRNDUP
+  SOURCE += oslib/strndup.c
+endif
 ifndef CONFIG_GETOPT_LONG_ONLY
   SOURCE += oslib/getopt_long.c
 endif
@@ -139,7 +143,7 @@
   LDFLAGS += -rdynamic
 endif
 ifeq ($(CONFIG_TARGET_OS), Android)
-  SOURCE += diskutil.c fifo.c blktrace.c trim.c profiles/tiobench.c \
+  SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \
 		oslib/linux-dev-lookup.c
   LIBS += -ldl
   LDFLAGS += -rdynamic
@@ -179,7 +183,6 @@
   LIBS	 += -lpthread -ldl
 endif
 ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
-  SOURCE := $(filter-out engines/mmap.c,$(SOURCE))
   SOURCE += os/windows/posix.c
   LIBS	 += -lpthread -lpsapi -lws2_32
   CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format -static
@@ -209,7 +212,8 @@
 
 T_ZIPF_OBS = t/genzipf.o
 T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/pattern.o lib/zipf.o \
-		lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o
+		lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o \
+		oslib/strndup.o
 T_ZIPF_PROGS = t/fio-genzipf
 
 T_AXMAP_OBJS = t/axmap.o
@@ -222,7 +226,7 @@
 
 T_GEN_RAND_OBJS = t/gen-rand.o
 T_GEN_RAND_OBJS += t/log.o t/debug.o lib/rand.o lib/pattern.o lib/strntol.o \
-			oslib/strcasestr.o
+			oslib/strcasestr.o oslib/strndup.o
 T_GEN_RAND_PROGS = t/gen-rand
 
 ifeq ($(CONFIG_TARGET_OS), Linux)
@@ -234,10 +238,10 @@
 T_DEDUPE_OBJS = t/dedupe.o
 T_DEDUPE_OBJS += lib/rbtree.o t/log.o mutex.o smalloc.o gettime.o crc/md5.o \
 		lib/memalign.o lib/bloom.o t/debug.o crc/xxhash.o t/arch.o \
-		crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o crc/fnv.o
+		crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o crc/fnv.o
 T_DEDUPE_PROGS = t/fio-dedupe
 
-T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o t/debug.o
+T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o t/debug.o
 T_VS_PROGS = t/fio-verify-state
 
 T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o
@@ -246,6 +250,9 @@
 T_MEMLOCK_OBJS = t/memlock.o
 T_MEMLOCK_PROGS = t/memlock
 
+T_TT_OBJS = t/time-test.o
+T_TT_PROGS = t/time-test
+
 T_OBJS = $(T_SMALLOC_OBJS)
 T_OBJS += $(T_IEEE_OBJS)
 T_OBJS += $(T_ZIPF_OBJS)
@@ -257,6 +264,7 @@
 T_OBJS += $(T_VS_OBJS)
 T_OBJS += $(T_PIPE_ASYNC_OBJS)
 T_OBJS += $(T_MEMLOCK_OBJS)
+T_OBJS += $(T_TT_OBJS)
 
 ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
     T_DEDUPE_OBJS += os/windows/posix.o lib/hweight.o
@@ -304,7 +312,7 @@
 
 all: $(PROGS) $(T_TEST_PROGS) $(SCRIPTS) FORCE
 
-.PHONY: all install clean
+.PHONY: all install clean test
 .PHONY: FORCE cscope
 
 FIO-VERSION-FILE: FORCE
@@ -319,8 +327,13 @@
 	@$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d
 	@mv -f $*.d $*.d.tmp
 	@sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d
+ifeq ($(CONFIG_TARGET_OS), NetBSD)
+	@sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | tr -cs "[:graph:]" "\n" | \
+		sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d
+else
 	@sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \
 		sed -e 's/^ *//' -e 's/$$/:/' >> $*.d
+endif
 	@rm -f $*.d.tmp
 
 ifdef CONFIG_ARITHMETIC
@@ -358,8 +371,13 @@
 	@$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d
 	@mv -f $*.d $*.d.tmp
 	@sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d
+ifeq ($(CONFIG_TARGET_OS), NetBSD)
+	@sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | tr -cs "[:graph:]" "\n" | \
+		sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d
+else
 	@sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \
 		sed -e 's/^ *//' -e 's/$$/:/' >> $*.d
+endif
 	@rm -f $*.d.tmp
 
 gcompat.o: gcompat.c gcompat.h
@@ -430,8 +448,12 @@
 t/fio-verify-state: $(T_VS_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_VS_OBJS) $(LIBS)
 
+t/time-test: $(T_TT_OBJS)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_TT_OBJS) $(LIBS)
+
 clean: FORCE
 	@rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio FIO-VERSION-FILE *.d lib/*.d oslib/*.d crc/*.d engines/*.d profiles/*.d t/*.d config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
+	@rm -rf  doc/output
 
 distclean: clean FORCE
 	@rm -f cscope.out fio.pdf fio_generate_plots.pdf fio2gnuplot.pdf fiologparser_hist.pdf
@@ -448,7 +470,8 @@
 	@man -t tools/plot/fio2gnuplot.1 | ps2pdf - fio2gnuplot.pdf
 	@man -t tools/hist/fiologparser_hist.py.1 | ps2pdf - fiologparser_hist.pdf
 
-test:
+test: fio
+	./fio --minimal --thread --exitall_on_error --runtime=1s --name=nulltest --ioengine=null --rw=randrw --iodepth=2 --norandommap --random_generator=tausworthe64 --size=16T --name=verifyfstest --filename=fiotestfile.tmp --unlink=1 --rw=write --verify=crc32c --verify_state_save=0 --size=16K
 
 install: $(PROGS) $(SCRIPTS) tools/plot/fio2gnuplot.1 FORCE
 	$(INSTALL) -m 755 -d $(DESTDIR)$(bindir)
diff -Nru fio-2.16/memory.c fio-3.1/memory.c
--- fio-2.16/memory.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/memory.c	2017-09-28 10:23:20.000000000 +0000
@@ -33,13 +33,13 @@
 	dprint(FD_MEM, "pinning %llu bytes\n", td->o.lockmem);
 
 	/*
-	 * Don't allow mlock of more than real_mem-128MB
+	 * Don't allow mlock of more than real_mem-128MiB
 	 */
 	phys_mem = os_phys_mem();
 	if (phys_mem) {
 		if ((td->o.lockmem + 128 * 1024 * 1024) > phys_mem) {
 			td->o.lockmem = phys_mem - 128 * 1024 * 1024;
-			log_info("fio: limiting mlocked memory to %lluMB\n",
+			log_info("fio: limiting mlocked memory to %lluMiB\n",
 							td->o.lockmem >> 20);
 		}
 	}
@@ -138,6 +138,9 @@
 	}
 
 	if (td->o.mmapfile) {
+		if (access(td->o.mmapfile, F_OK) == 0)
+			td->flags |= TD_F_MMAP_KEEP;
+
 		td->mmapfd = open(td->o.mmapfile, O_RDWR|O_CREAT, 0644);
 
 		if (td->mmapfd < 0) {
@@ -169,7 +172,7 @@
 		td->orig_buffer = NULL;
 		if (td->mmapfd != 1 && td->mmapfd != -1) {
 			close(td->mmapfd);
-			if (td->o.mmapfile)
+			if (td->o.mmapfile && !(td->flags & TD_F_MMAP_KEEP))
 				unlink(td->o.mmapfile);
 		}
 
@@ -187,7 +190,8 @@
 	if (td->o.mmapfile) {
 		if (td->mmapfd != -1)
 			close(td->mmapfd);
-		unlink(td->o.mmapfile);
+		if (!(td->flags & TD_F_MMAP_KEEP))
+			unlink(td->o.mmapfile);
 		free(td->o.mmapfile);
 	}
 }
@@ -207,6 +211,78 @@
 	free(td->orig_buffer);
 }
 
+static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem)
+{
+#ifdef CONFIG_CUDA
+	CUresult ret;
+	char name[128];
+
+	ret = cuInit(0);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed initialize cuda driver api\n");
+		return 1;
+	}
+
+	ret = cuDeviceGetCount(&td->gpu_dev_cnt);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed get device count\n");
+		return 1;
+	}
+	dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt);
+
+	if (td->gpu_dev_cnt == 0) {
+		log_err("fio: no GPU device found. "
+			"Can not perform GPUDirect RDMA.\n");
+		return 1;
+	}
+
+	td->gpu_dev_id = td->o.gpu_dev_id;
+	ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed get GPU device\n");
+		return 1;
+	}
+
+	ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed get device name\n");
+		return 1;
+	}
+	dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \
+	       td->gpu_dev_id, name);
+
+	ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed to create cuda context: %d\n", ret);
+		return 1;
+	}
+
+	ret = cuMemAlloc(&td->dev_mem_ptr, total_mem);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem);
+		return 1;
+	}
+	td->orig_buffer = (void *) td->dev_mem_ptr;
+
+	dprint(FD_MEM, "cudaMalloc %llu %p\n",				\
+	       (unsigned long long) total_mem, td->orig_buffer);
+	return 0;
+#else
+	return -EINVAL;
+#endif
+}
+
+static void free_mem_cudamalloc(struct thread_data *td)
+{
+#ifdef CONFIG_CUDA
+	if (td->dev_mem_ptr != NULL)
+		cuMemFree(td->dev_mem_ptr);
+
+	if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS)
+		log_err("fio: failed to destroy cuda context\n");
+#endif
+}
+
 /*
  * Set up the buffer area we need for io.
  */
@@ -246,6 +322,8 @@
 	else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
 		 td->o.mem_type == MEM_MMAPSHARED)
 		ret = alloc_mem_mmap(td, total_mem);
+	else if (td->o.mem_type == MEM_CUDA_MALLOC)
+		ret = alloc_mem_cudamalloc(td, total_mem);
 	else {
 		log_err("fio: bad mem type: %d\n", td->o.mem_type);
 		ret = 1;
@@ -275,6 +353,8 @@
 	else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
 		 td->o.mem_type == MEM_MMAPSHARED)
 		free_mem_mmap(td, total_mem);
+	else if (td->o.mem_type == MEM_CUDA_MALLOC)
+		free_mem_cudamalloc(td);
 	else
 		log_err("Bad memory type %u\n", td->o.mem_type);
 
diff -Nru fio-2.16/mutex.c fio-3.1/mutex.c
--- fio-2.16/mutex.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/mutex.c	2017-09-28 10:23:20.000000000 +0000
@@ -47,7 +47,7 @@
 		return ret;
 	}
 
-#ifdef FIO_HAVE_PSHARED_MUTEX
+#ifdef CONFIG_PSHARED
 	ret = pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
 	if (ret) {
 		log_err("pthread_condattr_setpshared: %s\n", strerror(ret));
@@ -77,7 +77,7 @@
 	/*
 	 * Not all platforms support process shared mutexes (FreeBSD)
 	 */
-#ifdef FIO_HAVE_PSHARED_MUTEX
+#ifdef CONFIG_PSHARED
 	ret = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
 	if (ret) {
 		log_err("pthread_mutexattr_setpshared: %s\n", strerror(ret));
@@ -141,11 +141,15 @@
 	return NULL;
 }
 
-static bool mutex_timed_out(struct timeval *t, unsigned int msecs)
+static bool mutex_timed_out(struct timespec *t, unsigned int msecs)
 {
-	struct timeval now;
+	struct timeval tv;
+	struct timespec now;
+
+	gettimeofday(&tv, NULL);
+	now.tv_sec = tv.tv_sec;
+	now.tv_nsec = tv.tv_usec * 1000;
 
-	gettimeofday(&now, NULL);
 	return mtime_since(t, &now) >= msecs;
 }
 
@@ -177,7 +181,7 @@
 		 * way too early, double check.
 		 */
 		ret = pthread_cond_timedwait(&mutex->cond, &mutex->lock, &t);
-		if (ret == ETIMEDOUT && !mutex_timed_out(&tv_s, msecs))
+		if (ret == ETIMEDOUT && !mutex_timed_out(&t, msecs))
 			ret = 0;
 	}
 	mutex->waiters--;
@@ -287,7 +291,7 @@
 		log_err("pthread_rwlockattr_init: %s\n", strerror(ret));
 		goto err;
 	}
-#ifdef FIO_HAVE_PSHARED_MUTEX
+#ifdef CONFIG_PSHARED
 	ret = pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
 	if (ret) {
 		log_err("pthread_rwlockattr_setpshared: %s\n", strerror(ret));
diff -Nru fio-2.16/optgroup.c fio-3.1/optgroup.c
--- fio-2.16/optgroup.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/optgroup.c	2017-09-28 10:23:20.000000000 +0000
@@ -31,16 +31,16 @@
 		.mask	= FIO_OPT_C_PROFILE,
 	},
 	{
+		.name	= "I/O engines",
+		.mask	= FIO_OPT_C_ENGINE,
+	},
+	{
 		.name	= NULL,
 	},
 };
 
 static const struct opt_group fio_opt_cat_groups[] = {
 	{
-		.name	= "Latency profiling",
-		.mask	= FIO_OPT_G_LATPROF,
-	},
-	{
 		.name	= "Rate",
 		.mask	= FIO_OPT_G_RATE,
 	},
@@ -125,13 +125,52 @@
 		.mask	= FIO_OPT_G_TIOBENCH,
 	},
 	{
-		.name	= "MTD",
+		.name	= "Error handling",
+		.mask	= FIO_OPT_G_ERR,
+	},
+	{
+		.name	= "Ext4 defrag I/O engine", /* e4defrag */
+		.mask	= FIO_OPT_G_E4DEFRAG,
+	},
+	{
+		.name	= "Network I/O engine", /* net */
+		.mask	= FIO_OPT_G_NETIO,
+	},
+	{
+		.name	= "RDMA I/O engine", /* rdma */
+		.mask	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= "libaio I/O engine", /* libaio */
+		.mask	= FIO_OPT_G_LIBAIO,
+	},
+	{
+		.name	= "ACT Aerospike like benchmark profile",
+		.mask	= FIO_OPT_G_ACT,
+	},
+	{
+		.name	= "Latency profiling",
+		.mask	= FIO_OPT_G_LATPROF,
+	},
+	{
+		.name	= "RBD I/O engine", /* rbd */
+		.mask	= FIO_OPT_G_RBD,
+	},
+	{
+		.name	= "GlusterFS I/O engine", /* gfapi,gfapi_async */
+		.mask	= FIO_OPT_G_GFAPI,
+	},
+	{
+		.name	= "MTD I/O engine", /* mtd */
 		.mask	= FIO_OPT_G_MTD,
 	},
-
+	{
+		.name	= "libhdfs I/O engine", /* libhdfs */
+		.mask	= FIO_OPT_G_HDFS,
+	},
 	{
 		.name	= NULL,
-	}
+	},
 };
 
 static const struct opt_group *group_from_mask(const struct opt_group *ogs,
diff -Nru fio-2.16/options.c fio-3.1/options.c
--- fio-2.16/options.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/options.c	2017-09-28 10:23:20.000000000 +0000
@@ -270,7 +270,8 @@
 	return 0;
 }
 
-static int ignore_error_type(struct thread_data *td, int etype, char *str)
+static int ignore_error_type(struct thread_data *td, enum error_type_bit etype,
+				char *str)
 {
 	unsigned int i;
 	int *error;
@@ -282,7 +283,7 @@
 	}
 
 	td->o.ignore_error_nr[etype] = 4;
-	error = malloc(4 * sizeof(struct bssplit));
+	error = calloc(4, sizeof(int));
 
 	i = 0;
 	while ((fname = strsep(&str, ":")) != NULL) {
@@ -306,8 +307,9 @@
 				error[i] = -error[i];
 		}
 		if (!error[i]) {
-			log_err("Unknown error %s, please use number value \n",
+			log_err("Unknown error %s, please use number value\n",
 				  fname);
+			td->o.ignore_error_nr[etype] = 0;
 			free(error);
 			return 1;
 		}
@@ -317,8 +319,10 @@
 		td->o.continue_on_error |= 1 << etype;
 		td->o.ignore_error_nr[etype] = i;
 		td->o.ignore_error[etype] = error;
-	} else
+	} else {
+		td->o.ignore_error_nr[etype] = 0;
 		free(error);
+	}
 
 	return 0;
 
@@ -328,7 +332,8 @@
 {
 	struct thread_data *td = cb_data_to_td(data);
 	char *str, *p, *n;
-	int type = 0, ret = 1;
+	int ret = 1;
+	enum error_type_bit type = 0;
 
 	if (parse_dryrun())
 		return 0;
@@ -1233,6 +1238,9 @@
 	strip_blank_front(&str);
 	strip_blank_end(str);
 
+	/*
+	 * Ignore what we may already have from nrfiles option.
+	 */
 	if (!td->files_index)
 		td->o.nr_files = 0;
 
@@ -1303,8 +1311,17 @@
 
 	assert(ret != 0);
 	td->o.buffer_pattern_bytes = ret;
-	if (!td->o.compress_percentage)
+
+	/*
+	 * If this job is doing any reading or has compression set,
+	 * ensure that we refill buffers for writes or we could be
+	 * invalidating the pattern through reads.
+	 */
+	if (!td->o.compress_percentage && !td_read(td))
 		td->o.refill_buffers = 0;
+	else
+		td->o.refill_buffers = 1;
+
 	td->o.scramble_buffers = 0;
 	td->o.zero_buffers = 0;
 
@@ -1364,7 +1381,23 @@
 	td->o.disable_bw = !!val;
 	td->o.clat_percentiles = !val;
 	if (val)
-		td->tv_cache_mask = 63;
+		td->ts_cache_mask = 63;
+
+	return 0;
+}
+
+static int str_offset_cb(void *data, unsigned long long *__val)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	unsigned long long v = *__val;
+
+	if (parse_is_percent(v)) {
+		td->o.start_offset = 0;
+		td->o.start_offset_percent = -1ULL - v;
+		dprint(FD_PARSE, "SET start_offset_percent %d\n",
+					td->o.start_offset_percent);
+	} else
+		td->o.start_offset = v;
 
 	return 0;
 }
@@ -1377,6 +1410,8 @@
 	if (parse_is_percent(v)) {
 		td->o.size = 0;
 		td->o.size_percent = -1ULL - v;
+		dprint(FD_PARSE, "SET size_percent %d\n",
+					td->o.size_percent);
 	} else
 		td->o.size = v;
 
@@ -1427,6 +1462,39 @@
 	return 0;
 }
 
+/*
+ * str is supposed to be a substring of the strdup'd original string,
+ * and is valid only if it's a regular file path.
+ * This function keeps the pointer to the path as needed later.
+ *
+ * "external:/path/to/so\0" <- original pointer updated with strdup'd
+ * "external\0"             <- above pointer after parsed, i.e. ->ioengine
+ *          "/path/to/so\0" <- str argument, i.e. ->ioengine_so_path
+ */
+static int str_ioengine_external_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	struct stat sb;
+	char *p;
+
+	if (!str) {
+		log_err("fio: null external ioengine path\n");
+		return 1;
+	}
+
+	p = (char *)str; /* str is mutable */
+	strip_blank_front(&p);
+	strip_blank_end(p);
+
+	if (stat(p, &sb) || !S_ISREG(sb.st_mode)) {
+		log_err("fio: invalid external ioengine path \"%s\"\n", p);
+		return 1;
+	}
+
+	td->o.ioengine_so_path = p;
+	return 0;
+}
+
 static int rw_verify(struct fio_option *o, void *data)
 {
 	struct thread_data *td = cb_data_to_td(data);
@@ -1777,6 +1845,7 @@
 #endif
 			  { .ival = "external",
 			    .help = "Load external engine (append name)",
+			    .cb = str_ioengine_external_cb,
 			  },
 		},
 	},
@@ -1847,6 +1916,17 @@
 		.group	= FIO_OPT_G_IO_BASIC,
 	},
 	{
+		.name	= "serialize_overlap",
+		.lname	= "Serialize overlap",
+		.off1	= offsetof(struct thread_options, serialize_overlap),
+		.type	= FIO_OPT_BOOL,
+		.help	= "Wait for in-flight IOs that collide to complete",
+		.parent	= "iodepth",
+		.def	= "0",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IO_BASIC,
+	},
+	{
 		.name	= "io_submit_mode",
 		.lname	= "IO submit mode",
 		.type	= FIO_OPT_STR,
@@ -1882,7 +1962,8 @@
 		.alias	= "io_limit",
 		.lname	= "IO Size",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= offsetof(struct thread_options, io_limit),
+		.off1	= offsetof(struct thread_options, io_size),
+		.help	= "Total size of I/O to be performed",
 		.interval = 1024 * 1024,
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_INVALID,
@@ -1925,6 +2006,7 @@
 		.lname	= "IO offset",
 		.alias	= "fileoffset",
 		.type	= FIO_OPT_STR_VAL,
+		.cb	= str_offset_cb,
 		.off1	= offsetof(struct thread_options, start_offset),
 		.help	= "Start IO from this offset",
 		.def	= "0",
@@ -1965,7 +2047,7 @@
 		.off3	= offsetof(struct thread_options, bs[DDIR_TRIM]),
 		.minval = 1,
 		.help	= "Block size unit",
-		.def	= "4k",
+		.def	= "4096",
 		.parent = "rw",
 		.hide	= 1,
 		.interval = 512,
@@ -2232,9 +2314,13 @@
 			    .oval = FIO_FSERVICE_PARETO,
 			    .help = "Pareto randomized",
 			  },
+			  { .ival = "normal",
+			    .oval = FIO_FSERVICE_GAUSS,
+			    .help = "Normal (Gaussian) randomized",
+			  },
 			  { .ival = "gauss",
 			    .oval = FIO_FSERVICE_GAUSS,
-			    .help = "Normal (gaussian) distribution",
+			    .help = "Alias for normal",
 			  },
 			  { .ival = "roundrobin",
 			    .oval = FIO_FSERVICE_RR,
@@ -2248,14 +2334,14 @@
 		.parent = "nrfiles",
 		.hide	= 1,
 	},
-#ifdef CONFIG_POSIX_FALLOCATE
+#ifdef FIO_HAVE_ANY_FALLOCATE
 	{
 		.name	= "fallocate",
 		.lname	= "Fallocate",
 		.type	= FIO_OPT_STR,
 		.off1	= offsetof(struct thread_options, fallocate_mode),
 		.help	= "Whether pre-allocation is performed when laying out files",
-		.def	= "posix",
+		.def	= "native",
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_INVALID,
 		.posval	= {
@@ -2263,10 +2349,16 @@
 			    .oval = FIO_FALLOCATE_NONE,
 			    .help = "Do not pre-allocate space",
 			  },
+			  { .ival = "native",
+			    .oval = FIO_FALLOCATE_NATIVE,
+			    .help = "Use native pre-allocation if possible",
+			  },
+#ifdef CONFIG_POSIX_FALLOCATE
 			  { .ival = "posix",
 			    .oval = FIO_FALLOCATE_POSIX,
 			    .help = "Use posix_fallocate()",
 			  },
+#endif
 #ifdef CONFIG_LINUX_FALLOCATE
 			  { .ival = "keep",
 			    .oval = FIO_FALLOCATE_KEEP_SIZE,
@@ -2278,20 +2370,22 @@
 			    .oval = FIO_FALLOCATE_NONE,
 			    .help = "Alias for 'none'",
 			  },
+#ifdef CONFIG_POSIX_FALLOCATE
 			  { .ival = "1",
 			    .oval = FIO_FALLOCATE_POSIX,
 			    .help = "Alias for 'posix'",
 			  },
+#endif
 		},
 	},
-#else	/* CONFIG_POSIX_FALLOCATE */
+#else	/* FIO_HAVE_ANY_FALLOCATE */
 	{
 		.name	= "fallocate",
 		.lname	= "Fallocate",
 		.type	= FIO_OPT_UNSUPPORTED,
 		.help	= "Your platform does not support fallocate",
 	},
-#endif /* CONFIG_POSIX_FALLOCATE */
+#endif /* FIO_HAVE_ANY_FALLOCATE */
 	{
 		.name	= "fadvise_hint",
 		.lname	= "Fadvise hint",
@@ -2320,24 +2414,6 @@
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_INVALID,
 	},
-#ifdef FIO_HAVE_STREAMID
-	{
-		.name	= "fadvise_stream",
-		.lname	= "Fadvise stream",
-		.type	= FIO_OPT_INT,
-		.off1	= offsetof(struct thread_options, fadvise_stream),
-		.help	= "Use fadvise() to set stream ID",
-		.category = FIO_OPT_C_FILE,
-		.group	= FIO_OPT_G_INVALID,
-	},
-#else
-	{
-		.name	= "fadvise_stream",
-		.lname	= "Fadvise stream",
-		.type	= FIO_OPT_UNSUPPORTED,
-		.help	= "Your platform does not support fadvise stream ID",
-	},
-#endif
 	{
 		.name	= "fsync",
 		.lname	= "Fsync",
@@ -2600,6 +2676,12 @@
 			    .help = "Like mmap, but use huge pages",
 			  },
 #endif
+#ifdef CONFIG_CUDA
+			  { .ival = "cudamalloc",
+			    .oval = MEM_CUDA_MALLOC,
+			    .help = "Allocate GPU device memory for GPUDirect RDMA",
+			  },
+#endif
 		  },
 	},
 	{
@@ -2670,6 +2752,22 @@
 			    .oval = VERIFY_SHA512,
 			    .help = "Use sha512 checksums for verification",
 			  },
+			  { .ival = "sha3-224",
+			    .oval = VERIFY_SHA3_224,
+			    .help = "Use sha3-224 checksums for verification",
+			  },
+			  { .ival = "sha3-256",
+			    .oval = VERIFY_SHA3_256,
+			    .help = "Use sha3-256 checksums for verification",
+			  },
+			  { .ival = "sha3-384",
+			    .oval = VERIFY_SHA3_384,
+			    .help = "Use sha3-384 checksums for verification",
+			  },
+			  { .ival = "sha3-512",
+			    .oval = VERIFY_SHA3_512,
+			    .help = "Use sha3-512 checksums for verification",
+			  },
 			  { .ival = "xxhash",
 			    .oval = VERIFY_XXHASH,
 			    .help = "Use xxhash checksums for verification",
@@ -2885,7 +2983,7 @@
 		.off1	= offsetof(struct thread_options, trim_percentage),
 		.minval = 0,
 		.maxval = 100,
-		.help	= "Number of verify blocks to discard/trim",
+		.help	= "Number of verify blocks to trim (i.e., discard)",
 		.parent	= "verify",
 		.def	= "0",
 		.interval = 1,
@@ -2897,7 +2995,7 @@
 		.name	= "trim_verify_zero",
 		.lname	= "Verify trim zero",
 		.type	= FIO_OPT_BOOL,
-		.help	= "Verify that trim/discarded blocks are returned as zeroes",
+		.help	= "Verify that trimmed (i.e., discarded) blocks are returned as zeroes",
 		.off1	= offsetof(struct thread_options, trim_zero),
 		.parent	= "trim_percentage",
 		.hide	= 1,
@@ -3377,6 +3475,34 @@
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IO_TYPE,
 	},
+#ifdef FIO_HAVE_WRITE_HINT
+	{
+		.name	= "write_hint",
+		.lname	= "Write hint",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, write_hint),
+		.help	= "Set expected write life time",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_INVALID,
+		.posval = {
+			  { .ival = "none",
+			    .oval = RWH_WRITE_LIFE_NONE,
+			  },
+			  { .ival = "short",
+			    .oval = RWH_WRITE_LIFE_SHORT,
+			  },
+			  { .ival = "medium",
+			    .oval = RWH_WRITE_LIFE_MEDIUM,
+			  },
+			  { .ival = "long",
+			    .oval = RWH_WRITE_LIFE_LONG,
+			  },
+			  { .ival = "extreme",
+			    .oval = RWH_WRITE_LIFE_EXTREME,
+			  },
+		},
+	},
+#endif
 	{
 		.name	= "create_serialize",
 		.lname	= "Create serialize",
@@ -3543,6 +3669,18 @@
 		.help	= "Build fio with libnuma-dev(el) to enable this option",
 	},
 #endif
+#ifdef CONFIG_CUDA
+	{
+		.name	= "gpu_dev_id",
+		.lname	= "GPU device ID",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, gpu_dev_id),
+		.help	= "Set GPU device ID for GPUDirect RDMA",
+		.def    = "0",
+		.category = FIO_OPT_C_GENERAL,
+		.group	= FIO_OPT_G_INVALID,
+	},
+#endif
 	{
 		.name	= "end_fsync",
 		.lname	= "End fsync",
@@ -3846,6 +3984,16 @@
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
+		.name	= "stats",
+		.lname	= "Stats",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, stats),
+		.help	= "Enable collection of stats",
+		.def	= "1",
+		.category = FIO_OPT_C_STAT,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
 		.name	= "zero_buffers",
 		.lname	= "Zero I/O buffers",
 		.type	= FIO_OPT_STR_SET,
@@ -3928,6 +4076,18 @@
 		.off1	= offsetof(struct thread_options, clat_percentiles),
 		.help	= "Enable the reporting of completion latency percentiles",
 		.def	= "1",
+		.inverse = "lat_percentiles",
+		.category = FIO_OPT_C_STAT,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= "lat_percentiles",
+		.lname	= "IO latency percentiles",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, lat_percentiles),
+		.help	= "Enable the reporting of IO latency percentiles",
+		.def	= "0",
+		.inverse = "clat_percentiles",
 		.category = FIO_OPT_C_STAT,
 		.group	= FIO_OPT_G_INVALID,
 	},
@@ -4180,20 +4340,20 @@
 		.posval = {
 			  { .ival = "1024",
 			    .oval = 1024,
-			    .help = "Use 1024 as the K base",
+			    .help = "Inputs invert IEC and SI prefixes (for compatibility); outputs prefer binary",
 			  },
 			  { .ival = "1000",
 			    .oval = 1000,
-			    .help = "Use 1000 as the K base",
+			    .help = "Inputs use IEC and SI prefixes; outputs prefer SI",
 			  },
 		},
-		.help	= "How many bytes per KB for reporting (1000 or 1024)",
+		.help	= "Unit prefix interpretation for quantities of data (IEC and SI)",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
 		.name	= "unit_base",
-		.lname	= "Base unit for reporting (Bits or Bytes)",
+		.lname	= "Unit for quantities of data (Bits or Bytes)",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct thread_options, unit_base),
 		.prio	= 1,
@@ -4276,17 +4436,6 @@
 		.group	= FIO_OPT_G_IO_FLOW,
 	},
 	{
-		.name	= "skip_bad",
-		.lname	= "Skip operations against bad blocks",
-		.type	= FIO_OPT_BOOL,
-		.off1	= offsetof(struct thread_options, skip_bad),
-		.help	= "Skip operations against known bad blocks.",
-		.hide	= 1,
-		.def	= "0",
-		.category = FIO_OPT_C_IO,
-		.group	= FIO_OPT_G_MTD,
-	},
-	{
 		.name   = "steadystate",
 		.lname  = "Steady state threshold",
 		.alias  = "ss",
@@ -4321,6 +4470,7 @@
 		.name   = "steadystate_duration",
 		.lname  = "Steady state duration",
 		.alias  = "ss_dur",
+		.parent	= "steadystate",
 		.type   = FIO_OPT_STR_VAL_TIME,
 		.off1   = offsetof(struct thread_options, ss_dur),
 		.help   = "Stop workload upon attaining steady state for specified duration",
@@ -4334,6 +4484,7 @@
 		.name   = "steadystate_ramp_time",
 		.lname  = "Steady state ramp time",
 		.alias  = "ss_ramp",
+		.parent	= "steadystate",
 		.type   = FIO_OPT_STR_VAL_TIME,
 		.off1   = offsetof(struct thread_options, ss_ramp_time),
 		.help   = "Delay before initiation of data collection for steady state job termination testing",
@@ -4769,34 +4920,19 @@
 	return show_cmd_help(fio_options, opt);
 }
 
-void options_mem_dupe(void *data, struct fio_option *options)
-{
-	struct fio_option *o;
-	char **ptr;
-
-	for (o = &options[0]; o->name; o++) {
-		if (o->type != FIO_OPT_STR_STORE)
-			continue;
-
-		ptr = td_var(data, o, o->off1);
-		if (*ptr)
-			*ptr = strdup(*ptr);
-	}
-}
-
 /*
  * dupe FIO_OPT_STR_STORE options
  */
 void fio_options_mem_dupe(struct thread_data *td)
 {
-	options_mem_dupe(&td->o, fio_options);
+	options_mem_dupe(fio_options, &td->o);
 
 	if (td->eo && td->io_ops) {
 		void *oldeo = td->eo;
 
 		td->eo = malloc(td->io_ops->option_struct_size);
 		memcpy(td->eo, oldeo, td->io_ops->option_struct_size);
-		options_mem_dupe(td->eo, td->io_ops->options);
+		options_mem_dupe(td->io_ops->options, td->eo);
 	}
 }
 
diff -Nru fio-2.16/os/os-aix.h fio-3.1/os/os-aix.h
--- fio-2.16/os/os-aix.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-aix.h	2017-09-28 10:23:20.000000000 +0000
@@ -14,8 +14,6 @@
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
 
-#define FIO_HAVE_PSHARED_MUTEX
-
 #define OS_MAP_ANON		MAP_ANON
 #define OS_MSG_DONTWAIT		0
 
@@ -23,7 +21,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
diff -Nru fio-2.16/os/os-android.h fio-3.1/os/os-android.h
--- fio-2.16/os/os-android.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-android.h	2017-09-28 10:23:20.000000000 +0000
@@ -7,6 +7,7 @@
 #include <sys/mman.h>
 #include <sys/uio.h>
 #include <sys/syscall.h>
+#include <sys/sysmacros.h>
 #include <sys/vfs.h>
 #include <unistd.h>
 #include <fcntl.h>
@@ -20,6 +21,10 @@
 #include "binject.h"
 #include "../file.h"
 
+#ifndef __has_builtin         // Optional of course.
+  #define __has_builtin(x) 0  // Compatibility with non-clang compilers.
+#endif
+
 #define FIO_HAVE_DISK_UTIL
 #define FIO_HAVE_IOSCHED_SWITCH
 #define FIO_HAVE_IOPRIO
@@ -27,8 +32,8 @@
 #define FIO_HAVE_ODIRECT
 #define FIO_HAVE_HUGETLB
 #define FIO_HAVE_BLKTRACE
-#define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CL_SIZE
+#define FIO_HAVE_CGROUPS
 #define FIO_HAVE_FS_STAT
 #define FIO_HAVE_TRIM
 #define FIO_HAVE_GETTID
@@ -54,22 +59,19 @@
 #define MAP_HUGETLB 0x40000 /* arch specific */
 #endif
 
-
+#ifndef CONFIG_NO_SHM
 /*
- * The Android NDK doesn't currently export <sys/shm.h>, so define the
- * necessary stuff here.
+ * Bionic doesn't support SysV shared memory, so implement it using ashmem
  */
-
-#include <linux/shm.h>
-#define SHM_HUGETLB    04000
-
 #include <stdio.h>
 #include <linux/ashmem.h>
-#include <sys/mman.h>
+#include <linux/shm.h>
+#define shmid_ds shmid64_ds
+#define SHM_HUGETLB    04000
 
 #define ASHMEM_DEVICE	"/dev/ashmem"
 
-static inline int shmctl (int __shmid, int __cmd, struct shmid_ds *__buf)
+static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf)
 {
 	int ret=0;
 	if (__cmd == IPC_RMID)
@@ -82,47 +84,50 @@
 	return ret;
 }
 
-static inline int shmget (key_t __key, size_t __size, int __shmflg)
+static inline int shmget(key_t __key, size_t __size, int __shmflg)
 {
 	int fd,ret;
-	char key[11];
-	
+	char keybuf[11];
+
 	fd = open(ASHMEM_DEVICE, O_RDWR);
 	if (fd < 0)
 		return fd;
 
-	sprintf(key,"%d",__key);
-	ret = ioctl(fd, ASHMEM_SET_NAME, key);
+	sprintf(keybuf,"%d",__key);
+	ret = ioctl(fd, ASHMEM_SET_NAME, keybuf);
 	if (ret < 0)
 		goto error;
 
-	ret = ioctl(fd, ASHMEM_SET_SIZE, __size);
+	/* Stores size in first 8 bytes, allocate extra space */
+	ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t));
 	if (ret < 0)
 		goto error;
 
 	return fd;
-	
+
 error:
 	close(fd);
 	return ret;
 }
 
-static inline void *shmat (int __shmid, const void *__shmaddr, int __shmflg)
+static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg)
 {
-	size_t *ptr, size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL);
-	ptr = mmap(NULL, size + sizeof(size_t), PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0);
-	*ptr = size;    //save size at beginning of buffer, for use with munmap
-	return &ptr[1];
+	size_t size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL);
+	/* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */
+	uint64_t *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0);
+	/* Save size at beginning of buffer, for use with munmap */
+	*ptr = size;
+	return ptr + 1;
 }
 
 static inline int shmdt (const void *__shmaddr)
 {
-	size_t *ptr, size;
-	ptr = (size_t *)__shmaddr;
-	ptr--;
-	size = *ptr;    //find mmap size which we stored at the beginning of the buffer
-	return munmap((void *)ptr, size + sizeof(size_t));
+	/* Find mmap size which we stored at the beginning of the buffer */
+	uint64_t *ptr = (uint64_t *)__shmaddr - 1;
+	size_t size = *ptr;
+	return munmap(ptr, size);
 }
+#endif
 
 #define SPLICE_DEF_SIZE	(64*1024)
 
@@ -220,9 +225,19 @@
 #define FIO_O_NOATIME	0
 #endif
 
-#define fio_swap16(x)	__bswap_16(x)
-#define fio_swap32(x)	__bswap_32(x)
-#define fio_swap64(x)	__bswap_64(x)
+/* Check for GCC or Clang byte swap intrinsics */
+#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \
+     && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \
+     || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */
+#define fio_swap16(x)	__builtin_bswap16(x)
+#define fio_swap32(x)	__builtin_bswap32(x)
+#define fio_swap64(x)	__builtin_bswap64(x)
+#else
+#include <byteswap.h>
+#define fio_swap16(x)	bswap_16(x)
+#define fio_swap32(x)	bswap_32(x)
+#define fio_swap64(x)	bswap_64(x)
+#endif /* fio_swapN */
 
 #define CACHE_LINE_FILE	\
 	"/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"
@@ -259,7 +274,7 @@
 	return ret;
 }
 
-static inline int os_trim(int fd, unsigned long long start,
+static inline int os_trim(struct fio_file *f, unsigned long long start,
 			  unsigned long long len)
 {
 	uint64_t range[2];
@@ -267,7 +282,7 @@
 	range[0] = start;
 	range[1] = len;
 
-	if (!ioctl(fd, BLKDISCARD, range))
+	if (!ioctl(f->fd, BLKDISCARD, range))
 		return 0;
 
 	return errno;
diff -Nru fio-2.16/os/os-dragonfly.h fio-3.1/os/os-dragonfly.h
--- fio-2.16/os/os-dragonfly.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-dragonfly.h	2017-09-28 10:23:20.000000000 +0000
@@ -5,6 +5,7 @@
 
 #include <errno.h>
 #include <unistd.h>
+#include <sys/endian.h>
 #include <sys/param.h>
 #include <sys/sysctl.h>
 #include <sys/statvfs.h>
@@ -24,6 +25,7 @@
 #define FIO_HAVE_GETTID
 #define FIO_HAVE_CPU_AFFINITY
 #define FIO_HAVE_IOPRIO
+#define FIO_HAVE_SHM_ATTACH_REMOVED
 
 #define OS_MAP_ANON		MAP_ANON
 
@@ -183,7 +185,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -214,7 +216,7 @@
 	return ret;
 }
 
-static inline int os_trim(int fd, unsigned long long start,
+static inline int os_trim(struct fio_file *f, unsigned long long start,
 			  unsigned long long len)
 {
 	off_t range[2];
@@ -222,7 +224,7 @@
 	range[0] = start;
 	range[1] = len;
 
-	if (!ioctl(fd, IOCTLTRIM, range))
+	if (!ioctl(f->fd, IOCTLTRIM, range))
 		return 0;
 
 	return errno;
@@ -232,4 +234,15 @@
 #define FIO_MADV_FREE	MADV_FREE
 #endif
 
+static inline int shm_attach_to_open_removed(void)
+{
+	int x;	/* receives the value of the kern.ipc.shm_allow_removed sysctl */
+	size_t len = sizeof(x);
+
+	if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0)
+		return 0;	/* sysctl query failed: report 0 */
+
+	return x > 0 ? 1 : 0;	/* 1 only when the sysctl value is positive */
+}
+
 #endif
diff -Nru fio-2.16/os/os-freebsd.h fio-3.1/os/os-freebsd.h
--- fio-2.16/os/os-freebsd.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-freebsd.h	2017-09-28 10:23:20.000000000 +0000
@@ -6,6 +6,7 @@
 #include <errno.h>
 #include <sys/sysctl.h>
 #include <sys/disk.h>
+#include <sys/endian.h>
 #include <sys/thr.h>
 #include <sys/socket.h>
 #include <sys/param.h>
@@ -22,6 +23,7 @@
 #define FIO_HAVE_TRIM
 #define FIO_HAVE_GETTID
 #define FIO_HAVE_CPU_AFFINITY
+#define FIO_HAVE_SHM_ATTACH_REMOVED
 
 #define OS_MAP_ANON		MAP_ANON
 
@@ -81,7 +83,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -115,7 +117,7 @@
 	return ret;
 }
 
-static inline int os_trim(int fd, unsigned long long start,
+static inline int os_trim(struct fio_file *f, unsigned long long start,
 			  unsigned long long len)
 {
 	off_t range[2];
@@ -123,7 +125,7 @@
 	range[0] = start;
 	range[1] = len;
 
-	if (!ioctl(fd, DIOCGDELETE, range))
+	if (!ioctl(f->fd, DIOCGDELETE, range))
 		return 0;
 
 	return errno;
@@ -133,4 +135,15 @@
 #define FIO_MADV_FREE	MADV_FREE
 #endif
 
+static inline int shm_attach_to_open_removed(void)
+{
+	int x;	/* receives the value of the kern.ipc.shm_allow_removed sysctl */
+	size_t len = sizeof(x);
+
+	if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0)
+		return 0;	/* sysctl query failed: report 0 */
+
+	return x > 0 ? 1 : 0;	/* 1 only when the sysctl value is positive */
+}
+
 #endif
diff -Nru fio-2.16/os/os.h fio-3.1/os/os.h
--- fio-2.16/os/os.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os.h	2017-09-28 10:23:20.000000000 +0000
@@ -60,11 +60,6 @@
 #endif
 #endif
 
-#ifdef FIO_HAVE_SGIO
-#include <linux/fs.h>
-#include <scsi/sg.h>
-#endif
-
 #ifndef CONFIG_STRSEP
 #include "../oslib/strsep.h"
 #endif
@@ -81,6 +76,7 @@
 #define POSIX_FADV_DONTNEED	(0)
 #define POSIX_FADV_SEQUENTIAL	(0)
 #define POSIX_FADV_RANDOM	(0)
+#define POSIX_FADV_NORMAL	(0)
 #endif
 
 #ifndef FIO_HAVE_CPU_AFFINITY
@@ -208,16 +204,20 @@
 
 #ifndef FIO_HAVE_BYTEORDER_FUNCS
 #ifdef CONFIG_LITTLE_ENDIAN
+#define __be64_to_cpu(x)		fio_swap64(x)
 #define __le16_to_cpu(x)		(x)
 #define __le32_to_cpu(x)		(x)
 #define __le64_to_cpu(x)		(x)
+#define __cpu_to_be64(x)		fio_swap64(x)
 #define __cpu_to_le16(x)		(x)
 #define __cpu_to_le32(x)		(x)
 #define __cpu_to_le64(x)		(x)
 #else
+#define __be64_to_cpu(x)		(x)
 #define __le16_to_cpu(x)		fio_swap16(x)
 #define __le32_to_cpu(x)		fio_swap32(x)
 #define __le64_to_cpu(x)		fio_swap64(x)
+#define __cpu_to_be64(x)		(x)
 #define __cpu_to_le16(x)		fio_swap16(x)
 #define __cpu_to_le32(x)		fio_swap32(x)
 #define __cpu_to_le64(x)		fio_swap64(x)
@@ -225,6 +225,10 @@
 #endif /* FIO_HAVE_BYTEORDER_FUNCS */
 
 #ifdef FIO_INTERNAL
+#define be64_to_cpu(val) ({			\
+	typecheck(uint64_t, val);		\
+	__be64_to_cpu(val);			\
+})
 #define le16_to_cpu(val) ({			\
 	typecheck(uint16_t, val);		\
 	__le16_to_cpu(val);			\
@@ -239,6 +243,10 @@
 })
 #endif
 
+#define cpu_to_be64(val) ({			\
+	typecheck(uint64_t, val);		\
+	__cpu_to_be64(val);			\
+})
 #define cpu_to_le16(val) ({			\
 	typecheck(uint16_t, val);		\
 	__cpu_to_le16(val);			\
@@ -252,19 +260,6 @@
 	__cpu_to_le64(val);			\
 })
 
-#ifndef FIO_HAVE_BLKTRACE
-static inline int is_blktrace(const char *fname, int *need_swap)
-{
-	return 0;
-}
-struct thread_data;
-static inline int load_blktrace(struct thread_data *td, const char *fname,
-				int need_swap)
-{
-	return 1;
-}
-#endif
-
 #define FIO_DEF_CL_SIZE		128
 
 static inline int os_cache_line_size(void)
@@ -315,12 +310,7 @@
 #endif
 
 #ifdef FIO_USE_GENERIC_INIT_RANDOM_STATE
-extern void td_fill_rand_seeds(struct thread_data *td);
-/*
- * Initialize the various random states we need (random io, block size ranges,
- * read/write mix, etc).
- */
-static inline int init_random_state(struct thread_data *td, unsigned long *rand_seeds, int size)
+static inline int init_random_seeds(unsigned long *rand_seeds, int size)
 {
 	int fd;
 
@@ -335,7 +325,6 @@
 	}
 
 	close(fd);
-	td_fill_rand_seeds(td);
 	return 0;
 }
 #endif
@@ -347,14 +336,6 @@
 }
 #endif
 
-#ifdef __powerpc64__
-#define FIO_HAVE_CPU_ONLINE_SYSCONF
-static inline unsigned int cpus_online(void)
-{
-        return sysconf(_SC_NPROCESSORS_CONF);
-}
-#endif
-
 #ifndef FIO_HAVE_CPU_ONLINE_SYSCONF
 static inline unsigned int cpus_online(void)
 {
@@ -385,4 +366,23 @@
 }
 #endif
 
+#ifndef FIO_HAVE_SHM_ATTACH_REMOVED
+static inline int shm_attach_to_open_removed(void)
+{
+	return 0;	/* fallback when FIO_HAVE_SHM_ATTACH_REMOVED is not defined */
+}
+#endif
+
+#ifndef FIO_HAVE_NATIVE_FALLOCATE
+static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len)
+{
+	errno = ENOSYS;	/* stub used when FIO_HAVE_NATIVE_FALLOCATE is not defined */
+	return false;	/* always fails; f, offset and len are unused */
+}
+#endif
+
+#if defined(CONFIG_POSIX_FALLOCATE) || defined(FIO_HAVE_NATIVE_FALLOCATE)
+# define FIO_HAVE_ANY_FALLOCATE
+#endif
+
 #endif
diff -Nru fio-2.16/os/os-hpux.h fio-3.1/os/os-hpux.h
--- fio-2.16/os/os-hpux.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-hpux.h	2017-09-28 10:23:20.000000000 +0000
@@ -22,7 +22,6 @@
 #define FIO_HAVE_ODIRECT
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
-#define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CHARDEV_SIZE
 
 #define OS_MAP_ANON		MAP_ANONYMOUS
@@ -44,7 +43,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
diff -Nru fio-2.16/os/os-linux.h fio-3.1/os/os-linux.h
--- fio-2.16/os/os-linux.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-linux.h	2017-09-28 10:23:20.000000000 +0000
@@ -16,12 +16,17 @@
 #include <linux/unistd.h>
 #include <linux/raw.h>
 #include <linux/major.h>
-#include <byteswap.h>
+#include <linux/fs.h>
+#include <scsi/sg.h>
 
 #include "./os-linux-syscall.h"
 #include "binject.h"
 #include "../file.h"
 
+#ifndef __has_builtin         // Optional of course.
+  #define __has_builtin(x) 0  // Compatibility with non-clang compilers.
+#endif
+
 #define FIO_HAVE_CPU_AFFINITY
 #define FIO_HAVE_DISK_UTIL
 #define FIO_HAVE_SGIO
@@ -32,7 +37,6 @@
 #define FIO_HAVE_HUGETLB
 #define FIO_HAVE_RAWBIND
 #define FIO_HAVE_BLKTRACE
-#define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CL_SIZE
 #define FIO_HAVE_CGROUPS
 #define FIO_HAVE_FS_STAT
@@ -41,6 +45,7 @@
 #define FIO_HAVE_GETTID
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
 #define FIO_HAVE_PWRITEV2
+#define FIO_HAVE_SHM_ATTACH_REMOVED
 
 #ifdef MAP_HUGETLB
 #define FIO_HAVE_MMAP_HUGE
@@ -219,21 +224,19 @@
 #define FIO_MADV_FREE	MADV_REMOVE
 #endif
 
-#if defined(__builtin_bswap16)
+/* Check for GCC or Clang byte swap intrinsics */
+#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \
+     && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \
+     || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */
 #define fio_swap16(x)	__builtin_bswap16(x)
-#else
-#define fio_swap16(x)	__bswap_16(x)
-#endif
-#if defined(__builtin_bswap32)
 #define fio_swap32(x)	__builtin_bswap32(x)
-#else
-#define fio_swap32(x)	__bswap_32(x)
-#endif
-#if defined(__builtin_bswap64)
 #define fio_swap64(x)	__builtin_bswap64(x)
 #else
-#define fio_swap64(x)	__bswap_64(x)
-#endif
+#include <byteswap.h>
+#define fio_swap16(x)	bswap_16(x)
+#define fio_swap32(x)	bswap_32(x)
+#define fio_swap64(x)	bswap_64(x)
+#endif /* fio_swapN */
 
 #define CACHE_LINE_FILE	\
 	"/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"
@@ -257,6 +260,14 @@
 		return atoi(size);
 }
 
+#ifdef __powerpc64__
+#define FIO_HAVE_CPU_ONLINE_SYSCONF
+static inline unsigned int cpus_online(void)
+{
+        return sysconf(_SC_NPROCESSORS_CONF);	/* NOTE(review): queries the _CONF count, not _ONLN, despite the name */
+}
+#endif
+
 static inline unsigned long long get_fs_free_size(const char *path)
 {
 	unsigned long long ret;
@@ -270,7 +281,7 @@
 	return ret;
 }
 
-static inline int os_trim(int fd, unsigned long long start,
+static inline int os_trim(struct fio_file *f, unsigned long long start,
 			  unsigned long long len)
 {
 	uint64_t range[2];
@@ -278,7 +289,7 @@
 	range[0] = start;
 	range[1] = len;
 
-	if (!ioctl(fd, BLKDISCARD, range))
+	if (!ioctl(f->fd, BLKDISCARD, range))
 		return 0;
 
 	return errno;
@@ -292,11 +303,26 @@
 }
 #endif
 
-#ifndef POSIX_FADV_STREAMID
-#define POSIX_FADV_STREAMID	8
+#ifndef F_GET_RW_HINT
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE	1024
+#endif
+#define F_GET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 11)
+#define F_SET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 12)
+#define F_GET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 13)
+#define F_SET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 14)
+#endif
+
+#ifndef RWH_WRITE_LIFE_NONE
+#define RWH_WRITE_LIFE_NOT_SET	0
+#define RWH_WRITE_LIFE_NONE	1
+#define RWH_WRITE_LIFE_SHORT	2
+#define RWH_WRITE_LIFE_MEDIUM	3
+#define RWH_WRITE_LIFE_LONG	4
+#define RWH_WRITE_LIFE_EXTREME	5
 #endif
 
-#define FIO_HAVE_STREAMID
+#define FIO_HAVE_WRITE_HINT
 
 #ifndef RWF_HIPRI
 #define RWF_HIPRI	0x00000001
@@ -308,14 +334,26 @@
 #define RWF_SYNC	0x00000004
 #endif
 
+#ifndef RWF_WRITE_LIFE_SHIFT
+#define RWF_WRITE_LIFE_SHIFT		4
+#define RWF_WRITE_LIFE_SHORT		(1 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_MEDIUM		(2 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_LONG		(3 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_EXTREME		(4 << RWF_WRITE_LIFE_SHIFT)
+#endif
+
 #ifndef CONFIG_PWRITEV2
 #ifdef __NR_preadv2
 static inline void make_pos_h_l(unsigned long *pos_h, unsigned long *pos_l,
 				off_t offset)
 {
+#if BITS_PER_LONG == 64
+	*pos_l = offset;
+	*pos_h = 0;
+#else
 	*pos_l = offset & 0xffffffff;
 	*pos_h = ((uint64_t) offset) >> 32;
-
+#endif
 }
 static inline ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt,
 			      off_t offset, unsigned int flags)
@@ -349,4 +387,27 @@
 #endif /* __NR_preadv2 */
 #endif /* CONFIG_PWRITEV2 */
 
+static inline int shm_attach_to_open_removed(void)
+{
+	return 1;	/* Linux variant: unconditionally reports 1 */
+}
+
+#ifdef CONFIG_LINUX_FALLOCATE
+#define FIO_HAVE_NATIVE_FALLOCATE
+static inline bool fio_fallocate(struct fio_file *f, uint64_t offset,
+				 uint64_t len)
+{
+	/* Preallocate len bytes starting at offset; true on success, else errno is set */
+	int ret = fallocate(f->fd, 0, offset, len);
+	if (ret == 0)
+		return true;
+
+	/* Work around buggy old glibc versions... */
+	if (ret > 0)
+		errno = ret;
+
+	return false;
+}
+#endif
+
 #endif
diff -Nru fio-2.16/os/os-mac.h fio-3.1/os/os-mac.h
--- fio-2.16/os/os-mac.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-mac.h	2017-09-28 10:23:20.000000000 +0000
@@ -20,6 +20,7 @@
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
 #define FIO_HAVE_GETTID
 #define FIO_HAVE_CHARDEV_SIZE
+#define FIO_HAVE_NATIVE_FALLOCATE
 
 #define OS_MAP_ANON		MAP_ANON
 
@@ -40,9 +41,9 @@
 #endif
 
 #define FIO_OS_DIRECTIO
-static inline int fio_set_odirect(int fd)
+static inline int fio_set_odirect(struct fio_file *f)
 {
-	if (fcntl(fd, F_NOCACHE, 1) == -1)
+	if (fcntl(f->fd, F_NOCACHE, 1) == -1)
 		return errno;
 	return 0;
 }
@@ -77,7 +78,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -101,4 +102,15 @@
  */
 extern int fdatasync(int fd);
 
+static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len)
+{
+	fstore_t store = {F_ALLOCATEALL, F_PEOFPOSMODE, offset, len};
+	if (fcntl(f->fd, F_PREALLOCATE, &store) != -1) {
+		if (ftruncate(f->fd, len) == 0)	/* NOTE(review): sizes the file to len, not offset + len -- confirm intended */
+			return true;
+	}
+
+	return false;	/* either the F_PREALLOCATE fcntl or the ftruncate failed */
+}
+
 #endif
diff -Nru fio-2.16/os/os-netbsd.h fio-3.1/os/os-netbsd.h
--- fio-2.16/os/os-netbsd.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-netbsd.h	2017-09-28 10:23:20.000000000 +0000
@@ -10,9 +10,10 @@
 #include <sys/ioctl.h>
 #include <sys/dkio.h>
 #include <sys/disklabel.h>
-/* XXX hack to avoid confilcts between rbtree.h and <sys/rb.h> */
-#define	rb_node	_rb_node
+#include <sys/endian.h>
 #include <sys/sysctl.h>
+
+/* XXX hack to avoid conflicts between rbtree.h and <sys/rbtree.h> */
 #undef rb_node
 #undef rb_left
 #undef rb_right
@@ -25,8 +26,6 @@
 #define FIO_HAVE_FS_STAT
 #define FIO_HAVE_GETTID
 
-#undef	FIO_HAVE_CPU_AFFINITY	/* XXX notyet */
-
 #define OS_MAP_ANON		MAP_ANON
 
 #ifndef PTHREAD_STACK_MIN
@@ -54,7 +53,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
diff -Nru fio-2.16/os/os-openbsd.h fio-3.1/os/os-openbsd.h
--- fio-2.16/os/os-openbsd.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-openbsd.h	2017-09-28 10:23:20.000000000 +0000
@@ -9,21 +9,22 @@
 #include <sys/ioctl.h>
 #include <sys/dkio.h>
 #include <sys/disklabel.h>
-/* XXX hack to avoid conflicts between rbtree.h and <sys/tree.h> */
+#include <sys/endian.h>
+#include <sys/utsname.h>
 #include <sys/sysctl.h>
+
+/* XXX hack to avoid conflicts between rbtree.h and <sys/tree.h> */
 #undef RB_BLACK
 #undef RB_RED
 #undef RB_ROOT
 
 #include "../file.h"
 
-#undef  FIO_HAVE_ODIRECT
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
 #define FIO_HAVE_FS_STAT
 #define FIO_HAVE_GETTID
-
-#undef	FIO_HAVE_CPU_AFFINITY	/* XXX notyet */
+#define FIO_HAVE_SHM_ATTACH_REMOVED
 
 #define OS_MAP_ANON		MAP_ANON
 
@@ -52,7 +53,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -67,7 +68,7 @@
 
 static inline int gettid(void)
 {
-	return (int) pthread_self();
+	return (int)(intptr_t) pthread_self();
 }
 
 static inline unsigned long long get_fs_free_size(const char *path)
@@ -87,4 +88,34 @@
 #define FIO_MADV_FREE	MADV_FREE
 #endif
 
+static inline int shm_attach_to_open_removed(void)
+{
+	struct utsname uts;
+	int major, minor;
+
+	if (uname(&uts) == -1)
+		return 0;	/* release unknown: report 0 */
+
+	/*
+	 * Return 1 if >= OpenBSD 5.1 according to 97900ebf,
+	 * assuming both major/minor versions are < 10.
+	 */
+	if (uts.release[0] > '9' || uts.release[0] < '0')
+		return 0;	/* first character is not a digit */
+	if (uts.release[1] != '.')
+		return 0;	/* expected a single-digit major followed by '.' */
+	if (uts.release[2] > '9' || uts.release[2] < '0')
+		return 0;	/* character after '.' is not a digit */
+
+	major = uts.release[0] - '0';
+	minor = uts.release[2] - '0';
+
+	if (major > 5)
+		return 1;
+	if (major == 5 && minor >= 1)
+		return 1;
+
+	return 0;	/* release is older than 5.1 */
+}
+
 #endif
diff -Nru fio-2.16/os/os-solaris.h fio-3.1/os/os-solaris.h
--- fio-2.16/os/os-solaris.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-solaris.h	2017-09-28 10:23:20.000000000 +0000
@@ -16,7 +16,6 @@
 #include "../file.h"
 
 #define FIO_HAVE_CPU_AFFINITY
-#define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CHARDEV_SIZE
 #define FIO_USE_GENERIC_BDEV_SIZE
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
@@ -61,7 +60,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return 0;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -86,9 +85,9 @@
 
 #define FIO_OS_DIRECTIO
 extern int directio(int, int);
-static inline int fio_set_odirect(int fd)
+static inline int fio_set_odirect(struct fio_file *f)
 {
-	if (directio(fd, DIRECTIO_ON) < 0)
+	if (directio(f->fd, DIRECTIO_ON) < 0)
 		return errno;
 
 	return 0;
@@ -98,7 +97,7 @@
  * pset binding hooks for fio
  */
 #define fio_setaffinity(pid, cpumask)		\
-	pset_bind((cpumask), P_PID, (pid), NULL)
+	pset_bind((cpumask), P_LWPID, (pid), NULL)
 #define fio_getaffinity(pid, ptr)	({ 0; })
 
 #define fio_cpu_clear(mask, cpu)	pset_assign(PS_NONE, (cpu), NULL)
diff -Nru fio-2.16/os/os-windows.h fio-3.1/os/os-windows.h
--- fio-2.16/os/os-windows.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/os-windows.h	2017-09-28 10:23:20.000000000 +0000
@@ -116,7 +116,6 @@
 ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset);
 ssize_t pwrite(int fildes, const void *buf, size_t nbyte,
 		off_t offset);
-extern void td_fill_rand_seeds(struct thread_data *);
 
 static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
 {
@@ -152,9 +151,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	/* There's no way to invalidate the cache in Windows
-	 * so just pretend to succeed */
-	return 0;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -241,7 +238,7 @@
 	return 0;
 }
 
-static inline int init_random_state(struct thread_data *td, unsigned long *rand_seeds, int size)
+static inline int init_random_seeds(unsigned long *rand_seeds, int size)
 {
 	HCRYPTPROV hCryptProv;
 
@@ -260,7 +257,6 @@
 	}
 
 	CryptReleaseContext(hCryptProv, 0);
-	td_fill_rand_seeds(td);
 	return 0;
 }
 
Binary files /tmp/tmptaLXeb/IdYq6qTbHH/fio-2.16/os/windows/eula.rtf and /tmp/tmptaLXeb/svYbzIcWYm/fio-3.1/os/windows/eula.rtf differ
diff -Nru fio-2.16/os/windows/examples.wxs fio-3.1/os/windows/examples.wxs
--- fio-2.16/os/windows/examples.wxs	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/windows/examples.wxs	2017-09-28 10:23:20.000000000 +0000
@@ -9,48 +9,111 @@
                     <File Source="..\..\examples\aio-read.fio" />
                 </Component>
                 <Component>
+                    <File Source="..\..\examples\backwards-read.fio" />
+                </Component>
+                <Component>
+                    <File Source="..\..\examples\basic-verify.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\cpuio.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\dev-dax.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\disk-zone-profile.fio" />
                 </Component>
                 <Component>
+                  <File Source="..\..\examples\e4defrag.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\e4defrag2.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\enospc-pressure.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\falloc.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\fixed-rate-submission.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\flow.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\fsx.fio" />
                 </Component>
                 <Component>
+                  <File Source="..\..\examples\fusion-aw-sync.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\gfapi.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\iometer-file-access-server.fio" />
                 </Component>
                 <Component>
+                  <File Source="..\..\examples\jesd219.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\latency-profile.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\libhdfs.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\mtd.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\netio.fio" />
                 </Component>
                 <Component>
                     <File Source="..\..\examples\netio_multicast.fio" />
                 </Component>
                 <Component>
-                    <File Source="..\..\examples\ssd-test.fio" />
+                  <File Source="..\..\examples\null.fio" />
                 </Component>
                 <Component>
-                    <File Source="..\..\examples\surface-scan.fio" />
+                  <File Source="..\..\examples\numa.fio" />
                 </Component>
                 <Component>
-                    <File Source="..\..\examples\tiobench-example.fio" />
+                  <File Source="..\..\examples\pmemblk.fio" />
                 </Component>
                 <Component>
-                  <File Source="..\..\examples\null.fio" />
+                  <File Source="..\..\examples\poisson-rate-submission.fio" />
                 </Component>
                 <Component>
-                  <File Source="..\..\examples\flow.fio" />
+                  <File Source="..\..\examples\rand-zones.fio" />
                 </Component>
                 <Component>
-                  <File Source="..\..\examples\cpuio.fio" />
+                  <File Source="..\..\examples\rbd.fio" />
                 </Component>
                 <Component>
-                  <File Source="..\..\examples\falloc.fio" />
+                  <File Source="..\..\examples\rdmaio-client.fio" />
                 </Component>
                 <Component>
-                  <File Source="..\..\examples\fusion-aw-sync.fio" />
+                  <File Source="..\..\examples\rdmaio-server.fio" />
                 </Component>
                 <Component>
                   <File Source="..\..\examples\ssd-steadystate.fio" />
                 </Component>
                 <Component>
+                    <File Source="..\..\examples\ssd-test.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\steadystate.fio" />
+                </Component>
+                <Component>
+                    <File Source="..\..\examples\surface-scan.fio" />
+                </Component>
+                <Component>
+                    <File Source="..\..\examples\tiobench-example.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\waitfor.fio" />
+                </Component>
+                <Component>
                   <File Source="..\..\examples\zipf.fio" />
                 </Component>
         </DirectoryRef>
@@ -59,20 +122,41 @@
         <ComponentGroup Id="examples">
             <ComponentRef Id="_1mbs_clients.fio" />
             <ComponentRef Id="aio_read.fio" />
+            <ComponentRef Id="backwards_read.fio" />
+            <ComponentRef Id="basic_verify.fio" />
+            <ComponentRef Id="cpuio.fio" />
+            <ComponentRef Id="dev_dax.fio" />
             <ComponentRef Id="disk_zone_profile.fio" />
+            <ComponentRef Id="e4defrag.fio" />
+            <ComponentRef Id="e4defrag2.fio" />
+            <ComponentRef Id="enospc_pressure.fio" />
+            <ComponentRef Id="falloc.fio" />
+            <ComponentRef Id="fixed_rate_submission.fio" />
+            <ComponentRef Id="flow.fio" />
             <ComponentRef Id="fsx.fio" />
+            <ComponentRef Id="fusion_aw_sync.fio" />
+            <ComponentRef Id="gfapi.fio" />
             <ComponentRef Id="iometer_file_access_server.fio" />
+            <ComponentRef Id="jesd219.fio" />
+            <ComponentRef Id="latency_profile.fio" />
+            <ComponentRef Id="libhdfs.fio" />
+            <ComponentRef Id="mtd.fio" />
             <ComponentRef Id="netio.fio" />
             <ComponentRef Id="netio_multicast.fio" />
+            <ComponentRef Id="null.fio" />
+            <ComponentRef Id="numa.fio" />
+            <ComponentRef Id="pmemblk.fio" />
+            <ComponentRef Id="poisson_rate_submission.fio" />
+            <ComponentRef Id="rand_zones.fio" />
+            <ComponentRef Id="rbd.fio" />
+            <ComponentRef Id="rdmaio_client.fio" />
+            <ComponentRef Id="rdmaio_server.fio" />
+            <ComponentRef Id="ssd_steadystate.fio" />
             <ComponentRef Id="ssd_test.fio" />
+            <ComponentRef Id="steadystate.fio" />
             <ComponentRef Id="surface_scan.fio" />
             <ComponentRef Id="tiobench_example.fio" />
-            <ComponentRef Id="null.fio" />
-            <ComponentRef Id="flow.fio" />
-            <ComponentRef Id="cpuio.fio" />
-            <ComponentRef Id="falloc.fio" />
-            <ComponentRef Id="fusion_aw_sync.fio" />
-            <ComponentRef Id="ssd_steadystate.fio" />
+            <ComponentRef Id="waitfor.fio" />
             <ComponentRef Id="zipf.fio" />
         </ComponentGroup>
     </Fragment>
diff -Nru fio-2.16/os/windows/install.wxs fio-3.1/os/windows/install.wxs
--- fio-2.16/os/windows/install.wxs	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/windows/install.wxs	2017-09-28 10:23:20.000000000 +0000
@@ -10,7 +10,7 @@
 	<Product Id="*"
 	  Codepage="1252" Language="1033"
 	  Manufacturer="fio" Name="fio"
-	  UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="2.16">
+	  UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="3.1">
 		<Package
 		  Description="Flexible IO Tester"
 		  InstallerVersion="301" Keywords="Installer,MSI,Database"
@@ -58,7 +58,7 @@
 		<ComponentGroupRef Id="examples"/>
 	</Feature>
 
-	<Property Id="ARPURLINFOABOUT" Value="http://git.kernel.dk/?p=fio.git" />
+	<Property Id="ARPURLINFOABOUT" Value="http://git.kernel.dk/cgit/fio/" />
 	<Property Id='ARPCONTACT'>fio@vger.kernel.org</Property>
 	<Property Id='ARPHELPLINK'>http://www.spinics.net/lists/fio/</Property>
 	<Property Id='ARPURLUPDATEINFO'>http://bluestop.org/fio/</Property>
diff -Nru fio-2.16/os/windows/posix.c fio-3.1/os/windows/posix.c
--- fio-2.16/os/windows/posix.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/os/windows/posix.c	2017-09-28 10:23:20.000000000 +0000
@@ -25,8 +25,8 @@
 #include "../os-windows.h"
 #include "../../lib/hweight.h"
 
-extern unsigned long mtime_since_now(struct timeval *);
-extern void fio_gettime(struct timeval *, void *);
+extern unsigned long mtime_since_now(struct timespec *);
+extern void fio_gettime(struct timespec *, void *);
 
 /* These aren't defined in the MinGW headers */
 HRESULT WINAPI StringCchCopyA(
@@ -40,12 +40,6 @@
   const char *pszFormat,
   ...);
 
-int vsprintf_s(
-  char *buffer,
-  size_t numberOfElements,
-  const char *format,
-  va_list argptr);
-
 int win_to_posix_error(DWORD winerr)
 {
 	switch (winerr)
@@ -304,35 +298,76 @@
 		int fildes, off_t off)
 {
 	DWORD vaProt = 0;
+	DWORD mapAccess = 0;
+	DWORD lenlow;
+	DWORD lenhigh;
+	HANDLE hMap;
 	void* allocAddr = NULL;
 
 	if (prot & PROT_NONE)
 		vaProt |= PAGE_NOACCESS;
 
-	if ((prot & PROT_READ) && !(prot & PROT_WRITE))
+	if ((prot & PROT_READ) && !(prot & PROT_WRITE)) {
 		vaProt |= PAGE_READONLY;
+		mapAccess = FILE_MAP_READ;
+	}
 
-	if (prot & PROT_WRITE)
+	if (prot & PROT_WRITE) {
 		vaProt |= PAGE_READWRITE;
+		mapAccess |= FILE_MAP_WRITE;
+	}
+
+	lenlow = len & 0xFFFF;
+	lenhigh = len >> 16;
+	/* If the low DWORD is zero and the high DWORD is non-zero, `CreateFileMapping`
+	   will return ERROR_INVALID_PARAMETER. To avoid this, set both to zero. */
+	if (lenlow == 0) {
+		lenhigh = 0;
+	}
 
-	if ((flags & MAP_ANON) | (flags & MAP_ANONYMOUS))
+	if (flags & MAP_ANON || flags & MAP_ANONYMOUS)
 	{
 		allocAddr = VirtualAlloc(addr, len, MEM_COMMIT, vaProt);
 		if (allocAddr == NULL)
 			errno = win_to_posix_error(GetLastError());
 	}
+	else
+	{
+		hMap = CreateFileMapping((HANDLE)_get_osfhandle(fildes), NULL, vaProt, lenhigh, lenlow, NULL);
+
+		if (hMap != NULL)
+		{
+			allocAddr = MapViewOfFile(hMap, mapAccess, off >> 16, off & 0xFFFF, len);
+		}
+
+		if (hMap == NULL || allocAddr == NULL)
+			errno = win_to_posix_error(GetLastError());
+
+	}
 
 	return allocAddr;
 }
 
 int munmap(void *addr, size_t len)
 {
-	if (!VirtualFree(addr, 0, MEM_RELEASE)) {
-		errno = win_to_posix_error(GetLastError());
-		return -1;
+	BOOL success;
+
+	/* We may have allocated the memory with either MapViewOfFile or
+		 VirtualAlloc. Therefore, try calling UnmapViewOfFile first, and if that
+		 fails, call VirtualFree. */
+	success = UnmapViewOfFile(addr);
+
+	if (!success)
+	{
+		success = VirtualFree(addr, 0, MEM_RELEASE);
 	}
 
-	return 0;
+	return !success;
+}
+
+int msync(void *addr, size_t len, int flags)
+{
+	return !FlushViewOfFile(addr, len);
 }
 
 int fork(void)
@@ -549,7 +584,8 @@
 	while (path[i] != '\\' && path[i] != '/' && i >= 0)
 		i--;
 
-	strncpy(name, path + i + 1, MAX_PATH);
+	name[MAX_PATH - 1] = '\0';
+	strncpy(name, path + i + 1, MAX_PATH - 1);
 
 	return name;
 }
@@ -702,17 +738,9 @@
 
 int posix_madvise(void *addr, size_t len, int advice)
 {
-	log_err("%s is not implemented\n", __func__);
 	return ENOSYS;
 }
 
-/* Windows doesn't support advice for memory pages. Just ignore it. */
-int msync(void *addr, size_t len, int flags)
-{
-	errno = ENOSYS;
-	return -1;
-}
-
 int fdatasync(int fildes)
 {
 	return fsync(fildes);
@@ -825,7 +853,7 @@
 
 int nanosleep(const struct timespec *rqtp, struct timespec *rmtp)
 {
-	struct timeval tv;
+	struct timespec tv;
 	DWORD ms_remaining;
 	DWORD ms_total = (rqtp->tv_sec * 1000) + (rqtp->tv_nsec / 1000000.0);
 
diff -Nru fio-2.16/oslib/libmtd_common.h fio-3.1/oslib/libmtd_common.h
--- fio-2.16/oslib/libmtd_common.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/oslib/libmtd_common.h	2017-09-28 10:23:20.000000000 +0000
@@ -119,57 +119,6 @@
 	fprintf(stderr, "%s: warning!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \
 } while(0)
 
-#if defined(__UCLIBC__)
-/* uClibc versions before 0.9.34 don't have rpmatch() */
-#if __UCLIBC_MAJOR__ == 0 && \
-		(__UCLIBC_MINOR__ < 9 || \
-		(__UCLIBC_MINOR__ == 9 && __UCLIBC_SUBLEVEL__ < 34))
-#undef rpmatch
-#define rpmatch __rpmatch
-static inline int __rpmatch(const char *resp)
-{
-    return (resp[0] == 'y' || resp[0] == 'Y') ? 1 :
-	(resp[0] == 'n' || resp[0] == 'N') ? 0 : -1;
-}
-#endif
-#endif
-
-/**
- * prompt the user for confirmation
- */
-static inline bool prompt(const char *msg, bool def)
-{
-	char *line = NULL;
-	size_t len;
-	bool ret = def;
-
-	do {
-		normsg_cont("%s (%c/%c) ", msg, def ? 'Y' : 'y', def ? 'n' : 'N');
-		fflush(stdout);
-
-		while (getline(&line, &len, stdin) == -1) {
-			printf("failed to read prompt; assuming '%s'\n",
-				def ? "yes" : "no");
-			break;
-		}
-
-		if (strcmp("\n", line) != 0) {
-			switch (rpmatch(line)) {
-			case 0: ret = false; break;
-			case 1: ret = true; break;
-			case -1:
-				puts("unknown response; please try again");
-				continue;
-			}
-		}
-		break;
-	} while (1);
-
-	free(line);
-
-	return ret;
-}
-
 static inline int is_power_of_2(unsigned long long n)
 {
 	return (n != 0 && ((n & (n - 1)) == 0));
diff -Nru fio-2.16/oslib/linux-dev-lookup.c fio-3.1/oslib/linux-dev-lookup.c
--- fio-2.16/oslib/linux-dev-lookup.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/oslib/linux-dev-lookup.c	2017-09-28 10:23:20.000000000 +0000
@@ -1,12 +1,12 @@
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/sysmacros.h>
 #include <dirent.h>
 #include <string.h>
 #include <stdio.h>
 #include <unistd.h>
 
-#include "../os/os.h"
-#include "oslib/linux-dev-lookup.h"
+#include "linux-dev-lookup.h"
 
 int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
 			   unsigned int min)
@@ -21,7 +21,7 @@
 		return 0;
 
 	while ((dir = readdir(D)) != NULL) {
-		char full_path[256];
+		char full_path[257];
 
 		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
 			continue;
diff -Nru fio-2.16/oslib/strlcat.c fio-3.1/oslib/strlcat.c
--- fio-2.16/oslib/strlcat.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/oslib/strlcat.c	2017-09-28 10:23:20.000000000 +0000
@@ -1,5 +1,5 @@
 #include <string.h>
-#include "oslib/strlcat.h"
+#include "strlcat.h"
 
 size_t strlcat(char *dst, const char *src, size_t size)
 {
diff -Nru fio-2.16/oslib/strndup.c fio-3.1/oslib/strndup.c
--- fio-2.16/oslib/strndup.c	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/oslib/strndup.c	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,18 @@
+#include <stdlib.h>
+#include "strndup.h"
+
+#ifndef CONFIG_HAVE_STRNDUP
+
+char *strndup(const char *s, size_t n)
+{
+	char *str = malloc(n + 1);
+
+	if (str) {
+		strncpy(str, s, n);
+		str[n] = '\0';
+	}
+
+	return str;
+}
+
+#endif
diff -Nru fio-2.16/oslib/strndup.h fio-3.1/oslib/strndup.h
--- fio-2.16/oslib/strndup.h	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/oslib/strndup.h	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,7 @@
+#include <string.h>
+
+#ifndef CONFIG_HAVE_STRNDUP
+
+char *strndup(const char *s, size_t n);
+
+#endif
diff -Nru fio-2.16/parse.c fio-3.1/parse.c
--- fio-2.16/parse.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/parse.c	2017-09-28 10:23:20.000000000 +0000
@@ -135,6 +135,7 @@
 	const char *p = str;
 	char *c;
 	unsigned long long mult = 1;
+	int i;
 
 	/*
          * Go forward until we hit a non-digit, or +/- sign
@@ -153,7 +154,7 @@
 	}
 
 	c = strdup(p);
-	for (int i = 0; i < strlen(c); i++)
+	for (i = 0; i < strlen(c); i++)
 		c[i] = tolower(c[i]);
 
 	if (!strncmp("us", c, 2) || !strncmp("usec", c, 4))
@@ -167,7 +168,7 @@
 	else if (!strcmp("h", c))
 		mult = 60 * 60 * 1000000UL;
 	else if (!strcmp("d", c))
-		mult = 24 * 60 * 60 * 1000000UL;
+		mult = 24 * 60 * 60 * 1000000ULL;
 
 	free(c);
 	return mult;
@@ -207,32 +208,50 @@
 		}
 	}
 
+	/* If kb_base is 1000, use true units.
+	 * If kb_base is 1024, use opposite units.
+	 */
 	if (!strncmp("pib", c, 3)) {
 		pow = 5;
-		mult = 1000;
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
 	} else if (!strncmp("tib", c, 3)) {
 		pow = 4;
-		mult = 1000;
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
 	} else if (!strncmp("gib", c, 3)) {
 		pow = 3;
-		mult = 1000;
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
 	} else if (!strncmp("mib", c, 3)) {
 		pow = 2;
-		mult = 1000;
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
 	} else if (!strncmp("kib", c, 3)) {
 		pow = 1;
-		mult = 1000;
-	} else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2))
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
+	} else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2)) {
 		pow = 5;
-	else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2))
+	} else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2)) {
 		pow = 4;
-	else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2))
+	} else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2)) {
 		pow = 3;
-	else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2))
+	} else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2)) {
 		pow = 2;
-	else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2))
+	} else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2)) {
 		pow = 1;
-	else if (!strncmp("%", c, 1)) {
+	} else if (!strncmp("%", c, 1)) {
 		*percent = 1;
 		free(c);
 		return ret;
@@ -1301,6 +1320,23 @@
 	}
 }
 
+void options_mem_dupe(struct fio_option *options, void *data)
+{
+	struct fio_option *o;
+	char **ptr;
+
+	dprint(FD_PARSE, "dup options\n");
+
+	for (o = &options[0]; o->name; o++) {
+		if (o->type != FIO_OPT_STR_STORE)
+			continue;
+
+		ptr = td_var(data, o, o->off1);
+		if (*ptr)
+			*ptr = strdup(*ptr);
+	}
+}
+
 void options_free(struct fio_option *options, void *data)
 {
 	struct fio_option *o;
@@ -1309,7 +1345,7 @@
 	dprint(FD_PARSE, "free options\n");
 
 	for (o = &options[0]; o->name; o++) {
-		if (o->type != FIO_OPT_STR_STORE || !o->off1)
+		if (o->type != FIO_OPT_STR_STORE || !o->off1 || o->no_free)
 			continue;
 
 		ptr = td_var(data, o, o->off1);
diff -Nru fio-2.16/parse.h fio-3.1/parse.h
--- fio-2.16/parse.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/parse.h	2017-09-28 10:23:20.000000000 +0000
@@ -78,6 +78,7 @@
 	int is_time;			/* time based value */
 	int no_warn_def;
 	int pow2;			/* must be a power-of-2 */
+	int no_free;
 };
 
 extern int parse_option(char *, const char *, struct fio_option *, struct fio_option **, void *, struct flist_head *);
@@ -86,6 +87,7 @@
 extern int show_cmd_help(struct fio_option *, const char *);
 extern void fill_default_options(void *, struct fio_option *);
 extern void options_init(struct fio_option *);
+extern void options_mem_dupe(struct fio_option *, void *);
 extern void options_free(struct fio_option *, void *);
 
 extern void strip_blank_front(char **);
@@ -106,8 +108,7 @@
 typedef int (fio_opt_int_fn)(void *, int *);
 
 struct thread_options;
-static inline void *td_var(struct thread_options *to, struct fio_option *o,
-			   unsigned int offset)
+static inline void *td_var(void *to, struct fio_option *o, unsigned int offset)
 {
 	void *ret;
 
diff -Nru fio-2.16/printing.c fio-3.1/printing.c
--- fio-2.16/printing.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/printing.c	2017-09-28 10:23:20.000000000 +0000
@@ -31,7 +31,7 @@
 			      gpointer data)
 {
 	cairo_t *cr;
-	char str[20];
+	char str[32];
 	double x, y;
 
 	cr = gtk_print_context_get_cairo_context(context);
diff -Nru fio-2.16/profiles/act.c fio-3.1/profiles/act.c
--- fio-2.16/profiles/act.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/profiles/act.c	2017-09-28 10:23:20.000000000 +0000
@@ -47,20 +47,12 @@
 static struct act_run_data *act_run_data;
 
 struct act_prof_data {
-	struct timeval sample_tv;
+	struct timespec sample_tv;
 	struct act_slice *slices;
 	unsigned int cur_slice;
 	unsigned int nr_slices;
 };
 
-static char *device_names;
-static unsigned int load;
-static unsigned int prep;
-static unsigned int threads_per_queue;
-static unsigned int num_read_blocks;
-static unsigned int write_size;
-static unsigned long long test_duration;
-
 #define ACT_MAX_OPTS	128
 static const char *act_opts[ACT_MAX_OPTS] = {
 	"direct=1",
@@ -97,6 +89,7 @@
 		.help	= "Devices to use",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_ACT,
+		.no_free = true,
 	},
 	{
 		.name	= "load",
@@ -130,21 +123,21 @@
 	},
 	{
 		.name	= "read-req-num-512-blocks",
-		.lname	= "Number of 512b blocks to read",
+		.lname	= "Number of 512B blocks to read",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct act_options, num_read_blocks),
-		.help	= "Number of 512b blocks to read at the time",
+		.help	= "Number of 512B blocks to read at the time",
 		.def	= "3",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_ACT,
 	},
 	{
 		.name	= "large-block-op-kbytes",
-		.lname	= "Size of large block ops (writes)",
+		.lname	= "Size of large block ops in KiB (writes)",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct act_options, write_size),
-		.help	= "Size of large block ops (writes)",
-		.def	= "128k",
+		.help	= "Size of large block ops in KiB (writes)",
+		.def	= "131072",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_ACT,
 	},
@@ -185,6 +178,8 @@
 
 static int act_add_rw(const char *dev, int reads)
 {
+	struct act_options *ao = &act_options;
+
 	if (act_add_opt("name=act-%s-%s", reads ? "read" : "write", dev))
 		return 1;
 	if (act_add_opt("filename=%s", dev))
@@ -192,21 +187,21 @@
 	if (act_add_opt("rw=%s", reads ? "randread" : "randwrite"))
 		return 1;
 	if (reads) {
-		int rload = load * R_LOAD / threads_per_queue;
+		int rload = ao->load * R_LOAD / ao->threads_per_queue;
 
-		if (act_add_opt("numjobs=%u", threads_per_queue))
+		if (act_add_opt("numjobs=%u", ao->threads_per_queue))
 			return 1;
 		if (act_add_opt("rate_iops=%u", rload))
 			return 1;
-		if (act_add_opt("bs=%u", num_read_blocks * 512))
+		if (act_add_opt("bs=%u", ao->num_read_blocks * 512))
 			return 1;
 	} else {
-		const int rsize = write_size / (num_read_blocks * 512);
-		int wload = (load * W_LOAD + rsize - 1) / rsize;
+		const int rsize = ao->write_size / (ao->num_read_blocks * 512);
+		int wload = (ao->load * W_LOAD + rsize - 1) / rsize;
 
 		if (act_add_opt("rate_iops=%u", wload))
 			return 1;
-		if (act_add_opt("bs=%u", write_size))
+		if (act_add_opt("bs=%u", ao->write_size))
 			return 1;
 	}
 
@@ -220,7 +215,7 @@
 		return 1;
 	if (act_add_opt("filename=%s", dev))
 		return 1;
-	if (act_add_opt("bs=1M"))
+	if (act_add_opt("bs=1048576"))
 		return 1;
 	if (act_add_opt("zero_buffers"))
 		return 1;
@@ -234,7 +229,7 @@
 		return 1;
 	if (act_add_opt("filename=%s", dev))
 		return 1;
-	if (act_add_opt("bs=4k"))
+	if (act_add_opt("bs=4096"))
 		return 1;
 	if (act_add_opt("ioengine=libaio"))
 		return 1;
@@ -248,10 +243,10 @@
 
 static int act_add_dev(const char *dev)
 {
-	if (prep)
+	if (act_options.prep)
 		return act_add_dev_prep(dev);
 
-	if (act_add_opt("runtime=%llus", test_duration))
+	if (act_add_opt("runtime=%llus", act_options.test_duration))
 		return 1;
 	if (act_add_opt("time_based=1"))
 		return 1;
@@ -269,7 +264,7 @@
  */
 static int act_prep_cmdline(void)
 {
-	if (!device_names) {
+	if (!act_options.device_names) {
 		log_err("act: you need to set IO target(s) with the "
 			"device-names option.\n");
 		return 1;
@@ -280,7 +275,7 @@
 	do {
 		char *dev;
 
-		dev = strsep(&device_names, ",");
+		dev = strsep(&act_options.device_names, ",");
 		if (!dev)
 			break;
 
@@ -300,7 +295,7 @@
 	int i, ret = 0;
 	double perm;
 
-	if (prep)
+	if (act_options.prep)
 		return 0;
 
 	/*
@@ -431,7 +426,7 @@
 	get_act_ref();
 
 	apd = calloc(1, sizeof(*apd));
-	nr_slices = (test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC;
+	nr_slices = (act_options.test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC;
 	apd->slices = calloc(nr_slices, sizeof(struct act_slice));
 	apd->nr_slices = nr_slices;
 	fio_gettime(&apd->sample_tv, NULL);
diff -Nru fio-2.16/profiles/tiobench.c fio-3.1/profiles/tiobench.c
--- fio-2.16/profiles/tiobench.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/profiles/tiobench.c	2017-09-28 10:23:20.000000000 +0000
@@ -39,7 +39,7 @@
 		.lname	= "Tiobench size",
 		.type	= FIO_OPT_STR_VAL,
 		.off1	= offsetof(struct tiobench_options, size),
-		.help	= "Size in MB",
+		.help	= "Size in MiB",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_TIOBENCH,
 	},
@@ -49,7 +49,7 @@
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct tiobench_options, bs),
 		.help	= "Block size in bytes",
-		.def	= "4k",
+		.def	= "4096",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_TIOBENCH,
 	},
@@ -70,6 +70,7 @@
 		.help	= "Test directory",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_TIOBENCH,
+		.no_free = true,
 	},
 	{
 		.name	= "threads",
@@ -91,7 +92,7 @@
 static int tb_prep_cmdline(void)
 {
 	/*
-	 * tiobench uses size as MB, so multiply up
+	 * tiobench uses size as MiB, so multiply up
 	 */
 	size *= 1024 * 1024ULL;
 	if (size)
diff -Nru fio-2.16/rate-submit.c fio-3.1/rate-submit.c
--- fio-2.16/rate-submit.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/rate-submit.c	2017-09-28 10:23:20.000000000 +0000
@@ -5,7 +5,7 @@
  *
  */
 #include "fio.h"
-#include "ioengine.h"
+#include "ioengines.h"
 #include "lib/getrusage.h"
 #include "rate-submit.h"
 
@@ -98,7 +98,6 @@
 {
 	struct thread_data *parent = sw->wq->td;
 	struct thread_data *td = sw->priv;
-	int fio_unused ret;
 
 	memcpy(&td->o, &parent->o, sizeof(td->o));
 	memcpy(&td->ts, &parent->ts, sizeof(td->ts));
diff -Nru fio-2.16/README fio-3.1/README
--- fio-2.16/README	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/README	2017-09-28 10:23:20.000000000 +0000
@@ -1,18 +1,31 @@
-fio
----
+Overview and history
+--------------------
 
-fio is a tool that will spawn a number of threads or processes doing a
-particular type of io action as specified by the user. fio takes a
-number of global parameters, each inherited by the thread unless
-otherwise parameters given to them overriding that setting is given.
-The typical use of fio is to write a job file matching the io load
-one wants to simulate.
+Fio was originally written to save me the hassle of writing special test case
+programs when I wanted to test a specific workload, either for performance
+reasons or to find/reproduce a bug. The process of writing such a test app can
+be tiresome, especially if you have to do it often.  Hence I needed a tool that
+would be able to simulate a given I/O workload without resorting to writing a
+tailored test case again and again.
+
+A test workload is difficult to define, though. There can be any number of
+processes or threads involved, and they can each be using their own way of
+generating I/O. You could have someone dirtying large amounts of memory in a
+memory mapped file, or maybe several threads issuing reads using asynchronous
+I/O. fio needed to be flexible enough to simulate both of these cases, and many
+more.
+
+Fio spawns a number of threads or processes doing a particular type of I/O
+action as specified by the user. fio takes a number of global parameters, each
+inherited by every thread unless a parameter given to that thread overrides
+that setting.  The typical use of fio is to write a job file matching
+the I/O load one wants to simulate.
 
 
 Source
 ------
 
-fio resides in a git repo, the canonical place is:
+Fio resides in a git repo, the canonical place is:
 
 	git://git.kernel.dk/fio.git
 
@@ -21,63 +34,37 @@
 
 	http://git.kernel.dk/fio.git
 
-Snapshots are frequently generated and include the git meta data as well.
+Snapshots are frequently generated; :file:`fio-git-*.tar.gz` snapshots include
+the git meta data as well. Other tarballs are archives of official fio releases.
 Snapshots can download from:
 
 	http://brick.kernel.dk/snaps/
 
-There are also two official mirrors. Both of these are automatically synced
-with the main repository, when changes are pushed. If the main repo is down
-for some reason, either one of these is safe to use as a backup:
+There are also two official mirrors. Both of these are automatically synced with
+the main repository, when changes are pushed. If the main repo is down for some
+reason, either one of these is safe to use as a backup:
 
 	git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+
 	https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
 
 or
 
 	git://github.com/axboe/fio.git
-	https://github.com/axboe/fio.git
-
-
-Binary packages
----------------
-
-Debian:
-Starting with Debian "Squeeze", fio packages are part of the official
-Debian repository. http://packages.debian.org/search?keywords=fio
-
-Ubuntu:
-Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
-of the Ubuntu "universe" repository.
-http://packages.ubuntu.com/search?keywords=fio
-
-Red Hat, CentOS & Co:
-Dag Wieërs has RPMs for Red Hat related distros, find them here:
-http://dag.wieers.com/rpm/packages/fio/
 
-Mandriva:
-Mandriva has integrated fio into their package repository, so installing
-on that distro should be as easy as typing 'urpmi fio'.
-
-Solaris:
-Packages for Solaris are available from OpenCSW. Install their pkgutil
-tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
-'pkgutil -i fio'.
-
-Windows:
-Bruce Cran <bruce@cran.org.uk> has fio packages for Windows at
-http://www.bluestop.org/fio/ .
+	https://github.com/axboe/fio.git
 
 
 Mailing list
 ------------
 
 The fio project mailing list is meant for anything related to fio including
-general discussion, bug reporting, questions, and development.
+general discussion, bug reporting, questions, and development. For bug reporting,
+see REPORTING-BUGS.
 
-An automated mail detailing recent commits is automatically sent to the
-list at most daily. The list address is fio@vger.kernel.org, subscribe
-by sending an email to majordomo@vger.kernel.org with
+An automated mail detailing recent commits is automatically sent to the list at
+most daily. The list address is fio@vger.kernel.org, subscribe by sending an
+email to majordomo@vger.kernel.org with
 
 	subscribe fio
 
@@ -90,260 +77,126 @@
 	http://maillist.kernel.dk/fio-devel/
 
 
-Building
---------
-
-Just type 'configure', 'make' and 'make install'.
-
-Note that GNU make is required. On BSD it's available from devel/gmake;
-on Solaris it's in the SUNWgmake package. On platforms where GNU make
-isn't the default, type 'gmake' instead of 'make'.
-
-Configure will print the enabled options. Note that on Linux based
-platforms, the libaio development packages must be installed to use
-the libaio engine. Depending on distro, it is usually called
-libaio-devel or libaio-dev.
-
-For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required
-to be installed.  gfio isn't built automatically and can be enabled
-with a --enable-gfio option to configure.
-
-To build FIO with a cross-compiler:
- $ make clean
- $ make CROSS_COMPILE=/path/to/toolchain/prefix
-Configure will attempt to determine the target platform automatically.
-
-It's possible to build fio for ESX as well, use the --esx switch to
-configure.
+Author
+------
 
+Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
+the Linux I/O subsystem and schedulers. He got tired of writing specific test
+applications to simulate a given workload, and found that the existing I/O
+benchmark/test tools out there weren't flexible enough to do what he wanted.
 
-Windows
--------
+Jens Axboe <axboe@kernel.dk> 20060905
 
-On Windows, Cygwin (http://www.cygwin.com/) is required in order to
-build fio. To create an MSI installer package install WiX 3.8 from
-http://wixtoolset.org and run dobuild.cmd from the
-os/windows directory.
 
-How to compile fio on 64-bit Windows:
+Binary packages
+---------------
 
- 1. Install Cygwin (http://www.cygwin.com/). Install 'make' and all
-    packages starting with 'mingw64-i686' and 'mingw64-x86_64'.
- 2. Open the Cygwin Terminal.
- 3. Go to the fio directory (source files).
- 4. Run 'make clean && make -j'.
+Debian:
+	Starting with Debian "Squeeze", fio packages are part of the official
+	Debian repository. http://packages.debian.org/search?keywords=fio .
 
-To build fio on 32-bit Windows, run './configure --build-32bit-win' before 'make'.
+Ubuntu:
+	Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
+	of the Ubuntu "universe" repository.
+	http://packages.ubuntu.com/search?keywords=fio .
+
+Red Hat, Fedora, CentOS & Co:
+	Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
+	packages are part of the Fedora/EPEL repositories.
+	https://apps.fedoraproject.org/packages/fio .
 
-It's recommended that once built or installed, fio be run in a Command Prompt
-or other 'native' console such as console2, since there are known to be display
-and signal issues when running it under a Cygwin shell
-(see http://code.google.com/p/mintty/issues/detail?id=56 for details).
+Mandriva:
+	Mandriva has integrated fio into their package repository, so installing
+	on that distro should be as easy as typing ``urpmi fio``.
 
+Arch Linux:
+        An Arch Linux package is provided under the Community sub-repository:
+        https://www.archlinux.org/packages/?sort=&q=fio
 
-Command line
-------------
+Solaris:
+	Packages for Solaris are available from OpenCSW. Install their pkgutil
+	tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
+	``pkgutil -i fio``.
 
-$ fio
-	--debug			Enable some debugging options (see below)
-	--parse-only		Parse options only, don't start any IO
-	--output		Write output to file
-	--runtime		Runtime in seconds
-	--bandwidth-log		Generate aggregate bandwidth logs
-	--minimal		Minimal (terse) output
-	--output-format=type	Output format (terse,json,json+,normal)
-	--terse-version=type	Terse version output format (default 3, or 2 or 4).
-	--version		Print version info and exit
-	--help			Print this page
-	--cpuclock-test		Perform test/validation of CPU clock
-	--crctest[=test]	Test speed of checksum functions
-	--cmdhelp=cmd		Print command help, "all" for all of them
-	--enghelp=engine	Print ioengine help, or list available ioengines
-	--enghelp=engine,cmd	Print help for an ioengine cmd
-	--showcmd		Turn a job file into command line options
-	--readonly		Turn on safety read-only checks, preventing
-				writes
-	--eta=when		When ETA estimate should be printed
-				May be "always", "never" or "auto"
-	--eta-newline=time	Force a new line for every 'time' period passed
-	--status-interval=t	Force full status dump every 't' period passed
-	--section=name		Only run specified section in job file.
-				Multiple sections can be specified.
-	--alloc-size=kb		Set smalloc pool to this size in kb (def 16384)
-	--warnings-fatal	Fio parser warnings are fatal
-	--max-jobs		Maximum number of threads/processes to support
-	--server=args		Start backend server. See Client/Server section.
-	--client=host		Connect to specified backend(s).
-	--remote-config=file	Tell fio server to load this local file
-	--idle-prof=option	Report cpu idleness on a system or percpu basis
-				(option=system,percpu) or run unit work
-				calibration only (option=calibrate).
-	--inflate-log=log	Inflate and output compressed log
-	--trigger-file=file	Execute trigger cmd when file exists
-	--trigger-timeout=t	Execute trigger af this time
-	--trigger=cmd		Set this command as local trigger
-	--trigger-remote=cmd	Set this command as remote trigger
-	--aux-path=path		Use this path for fio state generated files
-
-
-Any parameters following the options will be assumed to be job files,
-unless they match a job file parameter. Multiple job files can be listed 
-and each job file will be regarded as a separate group. fio will stonewall
-execution between each group.
-
-The --readonly option is an extra safety guard to prevent users from
-accidentally starting a write workload when that is not desired.  Fio
-will only write if rw=write/randwrite/rw/randrw is given.  This extra
-safety net can be used as an extra precaution as --readonly will also
-enable a write check in the io engine core to prevent writes due to
-unknown user space bug(s).
-
-The --debug option triggers additional logging by fio.
-Currently, additional logging is available for:
-
-	process		Dump info related to processes
-	file		Dump info related to file actions
-	io		Dump info related to IO queuing
-	mem		Dump info related to memory allocations
-	blktrace	Dump info related to blktrace setup
-	verify		Dump info related to IO verification
-	all		Enable all debug options
-	random		Dump info related to random offset generation
-	parse		Dump info related to option matching and parsing
-	diskutil	Dump info related to disk utilization updates
-	job:x		Dump info only related to job number x
-	mutex		Dump info only related to mutex up/down ops
-	profile		Dump info related to profile extensions
-	time		Dump info related to internal time keeping
-	net		Dump info related to networking connections
-	rate		Dump info related to IO rate switching
-	compress	Dump info related to log compress/decompress
-	? or help	Show available debug options.
-
-One can specify multiple debug options: e.g. --debug=file,mem will enable
-file and memory debugging.
-
-The --section option allows one to combine related jobs into one file.
-E.g. one job file could define light, moderate, and heavy sections. Tell fio to
-run only the "heavy" section by giving --section=heavy command line option.
-One can also specify the "write" operations in one section and "verify"
-operation in another section.  The --section option only applies to job
-sections.  The reserved 'global' section is always parsed and used.
-
-The --alloc-size switch allows one to use a larger pool size for smalloc.
-If running large jobs with randommap enabled, fio can run out of memory.
-Smalloc is an internal allocator for shared structures from a fixed size
-memory pool. The pool size defaults to 16M and can grow to 8 pools.
+Windows:
+	Rebecca Cran <rebecca+fio@bluestop.org> has fio packages for Windows at
+	http://www.bluestop.org/fio/ .
 
-NOTE: While running .fio_smalloc.* backing store files are visible in /tmp.
+BSDs:
+	Packages for BSDs may be available from their binary package repositories.
+	Look for a package "fio" using their binary package managers.
 
 
-Job file
+Building
 --------
 
-See the HOWTO file for a complete description of job file syntax and
-parameters.  The --cmdhelp option also lists all options. If used with
-an option argument, --cmdhelp will detail the given option.  The job file
-format is in the ini style format, as that is easy for the user to review
-and modify.
-
-This README contains the terse version. Job files can describe big and
-complex setups that are not possible with the command line.  Job files
-are a good practice even for simple jobs since the file provides an
-easily accessed record of the workload and can include comments.
-
-See the examples/ directory for inspiration on how to write job files.  Note
-the copyright and license requirements currently apply to examples/ files.
-
-
-Client/server
-------------
-
-Normally fio is invoked as a stand-alone application on the machine
-where the IO workload should be generated. However, the frontend and
-backend of fio can be run separately. Ie the fio server can generate
-an IO workload on the "Device Under Test" while being controlled from
-another machine.
-
-Start the server on the machine which has access to the storage DUT:
-
-fio --server=args
-
-where args defines what fio listens to. The arguments are of the form
-'type,hostname or IP,port'. 'type' is either 'ip' (or ip4) for TCP/IP v4,
-'ip6' for TCP/IP v6, or 'sock' for a local unix domain socket.
-'hostname' is either a hostname or IP address, and 'port' is the port to
-listen to (only valid for TCP/IP, not a local socket). Some examples:
-
-1) fio --server
-
-   Start a fio server, listening on all interfaces on the default port (8765).
-
-2) fio --server=ip:hostname,4444
-
-   Start a fio server, listening on IP belonging to hostname and on port 4444.
-
-3) fio --server=ip6:::1,4444
-
-   Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
-
-4) fio --server=,4444
+Just type::
 
-   Start a fio server, listening on all interfaces on port 4444.
+ $ ./configure
+ $ make
+ $ make install
+
+Note that GNU make is required. On BSDs it's available from devel/gmake within
+the ports directory; on Solaris it's in the SUNWgmake package.  On platforms
+where GNU make isn't the default, type ``gmake`` instead of ``make``.
+
+Configure will print the enabled options. Note that on Linux based platforms,
+the libaio development packages must be installed to use the libaio
+engine. Depending on distro, it is usually called libaio-devel or libaio-dev.
 
-5) fio --server=1.2.3.4
-
-   Start a fio server, listening on IP 1.2.3.4 on the default port.
-
-6) fio --server=sock:/tmp/fio.sock
-
-   Start a fio server, listening on the local socket /tmp/fio.sock.
-
-Once a server is running, a "client" can connect to the fio server with:
-
-fio --local-args --client=<server> --remote-args <job file(s)>
-
-where --local-args are arguments for the client where it is
-running, 'server' is the connect string, and --remote-args and <job file(s)>
-are sent to the server. The 'server' string follows the same format as it
-does on the server side, to allow IP/hostname/socket and port strings.
+For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required
+to be installed.  gfio isn't built automatically and can be enabled with a
+``--enable-gfio`` option to configure.
 
-Fio can connect to multiple servers this way:
+To build fio with a cross-compiler::
 
-fio --client=<server1> <job file(s)> --client=<server2> <job file(s)>
+ $ make clean
+ $ make CROSS_COMPILE=/path/to/toolchain/prefix
 
-If the job file is located on the fio server, then you can tell the server
-to load a local file as well. This is done by using --remote-config:
+Configure will attempt to determine the target platform automatically.
 
-fio --client=server --remote-config /path/to/file.fio
+It's possible to build fio for ESX as well; use the ``--esx`` switch to
+configure.
 
-Then fio will open this local (to the server) job file instead
-of being passed one from the client.
 
-If you have many servers (example: 100 VMs/containers), 
-you can input a pathname of a file containing host IPs/names as the parameter 
-value for the --client option.  For example, here is an example "host.list" 
-file containing 2 hostnames:
+Windows
+~~~~~~~
 
-host1.your.dns.domain
-host2.your.dns.domain
+On Windows, Cygwin (http://www.cygwin.com/) is required in order to build
+fio. To create an MSI installer package install WiX 3.8 from
+http://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
+directory.
 
-The fio command would then be:
+How to compile fio on 64-bit Windows:
 
-fio --client=host.list <job file(s)>
+ 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all
+    packages starting with **mingw64-i686** and **mingw64-x86_64**.
+ 2. Open the Cygwin Terminal.
+ 3. Go to the fio directory (source files).
+ 4. Run ``make clean && make -j``.
 
-In this mode, you cannot input server-specific parameters or job files -- all
-servers receive the same job file.  
+To build fio on 32-bit Windows, run ``./configure --build-32bit-win`` before
+``make``.
 
-In order to let fio --client runs use a shared filesystem 
-from multiple hosts, fio  --client now prepends the IP address of the 
-server to the filename.  For example, if fio is using directory /mnt/nfs/fio 
-and is writing filename fileio.tmp, with a --client hostfile containing 
-two hostnames h1 and h2 with IP addresses 192.168.10.120 and  192.168.10.121,
-then fio will create two files:
+It's recommended that once built or installed, fio be run in a Command Prompt or
+other 'native' console such as console2, since there are known to be display and
+signal issues when running it under a Cygwin shell (see
+https://github.com/mintty/mintty/issues/56 and
+https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs
+for details).
+
+
+Documentation
+~~~~~~~~~~~~~
+
+Fio uses Sphinx_ to generate documentation from the reStructuredText_ files.
+To build HTML formatted documentation run ``make -C doc html`` and direct your
+browser to :file:`./doc/output/html/index.html`.  To build the manual page, run
+``make -C doc man`` and then ``man doc/output/man/fio.1``.  To see what other
+output formats are supported run ``make -C doc help``.
 
-	/mnt/nfs/fio/192.168.10.120.fileio.tmp
-	/mnt/nfs/fio/192.168.10.121.fileio.tmp
+.. _reStructuredText: http://www.sphinx-doc.org/rest.html
+.. _Sphinx: http://www.sphinx-doc.org
 
 
 Platforms
@@ -351,32 +204,31 @@
 
 Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD,
 Windows, FreeBSD, and DragonFly. Some features and/or options may only be
-available on some of the platforms, typically because those features only
-apply to that platform (like the solarisaio engine, or the splice engine on
-Linux).
+available on some of the platforms, typically because those features only apply
+to that platform (like the solarisaio engine, or the splice engine on Linux).
 
 Some features are not available on FreeBSD/Solaris even if they could be
-implemented, I'd be happy to take patches for that. An example of that is
-disk utility statistics and (I think) huge page support, support for that
-does exist in FreeBSD/Solaris.
-
-Fio uses pthread mutexes for signalling and locking and FreeBSD does not
-support process shared pthread mutexes. As a result, only threads are
-supported on FreeBSD. This could be fixed with sysv ipc locking or
-other locking alternatives.
-
-Other *BSD platforms are untested, but fio should work there almost out
-of the box. Since I don't do test runs or even compiles on those platforms,
-your mileage may vary. Sending me patches for other platforms is greatly
+implemented, I'd be happy to take patches for that. An example of that is disk
+utility statistics and (I think) huge page support, support for that does exist
+in FreeBSD/Solaris.
+
+Fio uses pthread mutexes for signalling and locking and some platforms do not
+support process shared pthread mutexes. As a result, on such platforms only
+threads are supported. This could be fixed with sysv ipc locking or other
+locking alternatives.
+
+Other \*BSD platforms are untested, but fio should work there almost out of the
+box. Since I don't do test runs or even compiles on those platforms, your
+mileage may vary. Sending me patches for other platforms is greatly
 appreciated. There's a lot of value in having the same test/benchmark tool
 available on all platforms.
 
-Note that POSIX aio is not enabled by default on AIX. Messages like these:
+Note that POSIX aio is not enabled by default on AIX. Messages like these::
 
     Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because:
         Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix.
 
-indicate one needs to enable POSIX aio. Run the following commands as root:
+indicate one needs to enable POSIX aio. Run the following commands as root::
 
     # lsdev -C -l posix_aio0
         posix_aio0 Defined  Posix Asynchronous I/O
@@ -384,20 +236,41 @@
     # lsdev -C -l posix_aio0
         posix_aio0 Available  Posix Asynchronous I/O
 
-POSIX aio should work now. To make the change permanent:
+POSIX aio should work now. To make the change permanent::
 
     # chdev -l posix_aio0 -P -a autoconfig='available'
         posix_aio0 changed
 
 
-Author
-------
+Running fio
+-----------
 
-Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing
-of the Linux IO subsystem and schedulers. He got tired of writing
-specific test applications to simulate a given workload, and found that
-the existing io benchmark/test tools out there weren't flexible enough
-to do what he wanted.
+Running fio is normally the easiest part - you just give it the job file
+(or job files) as parameters::
 
-Jens Axboe <axboe@kernel.dk> 20060905
+	$ fio [options] [jobfile] ...
+
+and it will start doing what the *jobfile* tells it to do. You can give more
+than one job file on the command line; fio will serialize the running of those
+files. Internally that is the same as using the :option:`stonewall` parameter
+described in the parameter section.
+
+If the job file contains only one job, you may as well just give the parameters
+on the command line. The command line parameters are identical to the job
+parameters, with a few extra that control global parameters.  For example, for
+the job file parameter :option:`iodepth=2 <iodepth>`, the mirror command line
+option would be :option:`--iodepth 2 <iodepth>` or :option:`--iodepth=2
+<iodepth>`. You can also use the command line for giving more than one job
+entry. For each :option:`--name <name>` option that fio sees, it will start a
+new job with that name.  Command line entries following a
+:option:`--name <name>` entry will apply to that job, until there are no more
+entries or a new :option:`--name <name>` entry is seen. This is similar to the
+job file options, where each option applies to the current job until a new []
+job entry is seen.
+
+fio does not need to run as root, except if the files or devices specified in
+the job section require that. Some other options may also be restricted, such
+as memory locking, I/O scheduler switching, and decreasing the nice value.
 
+If *jobfile* is specified as ``-``, the job file will be read from standard
+input.
diff -Nru fio-2.16/server.c fio-3.1/server.c
--- fio-2.16/server.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/server.c	2017-09-28 10:23:20.000000000 +0000
@@ -50,17 +50,6 @@
 	struct flist_head next;	/* Other sk_entry's, if linked command */
 };
 
-struct sk_out {
-	unsigned int refs;	/* frees sk_out when it drops to zero.
-				 * protected by below ->lock */
-
-	int sk;			/* socket fd to talk to client */
-	struct fio_mutex lock;	/* protects ref and below list */
-	struct flist_head list;	/* list of pending transmit work */
-	struct fio_mutex wait;	/* wake backend when items added to list */
-	struct fio_mutex xmit;	/* held while sending data */
-};
-
 static char *fio_server_arg;
 static char *bind_sock;
 static struct sockaddr_in saddr_in;
@@ -263,9 +252,10 @@
 	return fio_sendv_data(sk, &iov, 1);
 }
 
-static int fio_recv_data(int sk, void *p, unsigned int len, bool wait)
+static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait)
 {
 	int flags;
+	char *p = buf;
 
 	if (wait)
 		flags = MSG_WAITALL;
@@ -388,7 +378,7 @@
 			break;
 
 		/* There's payload, get it */
-		pdu = (void *) cmdret->payload + pdu_offset;
+		pdu = (char *) cmdret->payload + pdu_offset;
 		ret = fio_recv_data(sk, pdu, cmd.pdu_len, wait);
 		if (ret)
 			break;
@@ -449,7 +439,7 @@
 
 	reply = calloc(1, sizeof(*reply));
 	INIT_FLIST_HEAD(&reply->list);
-	fio_gettime(&reply->tv, NULL);
+	fio_gettime(&reply->ts, NULL);
 	reply->saved_tag = tag;
 	reply->opcode = opcode;
 
@@ -866,7 +856,7 @@
 #ifdef CONFIG_BIG_ENDIAN
 	probe.bigendian = 1;
 #endif
-	strncpy((char *) probe.fio_version, fio_version_string, sizeof(probe.fio_version));
+	strncpy((char *) probe.fio_version, fio_version_string, sizeof(probe.fio_version) - 1);
 
 	probe.os	= FIO_OS;
 	probe.arch	= FIO_ARCH;
@@ -980,6 +970,7 @@
 	} else
 		fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, rep, sz, NULL, SK_F_FREE | SK_F_INLINE);
 
+	fio_terminate_threads(TERMINATE_ALL);
 	exec_trigger(buf);
 	return 0;
 }
@@ -1290,7 +1281,7 @@
 
 	ret = getsockname(sk, sockaddr_p, &len);
 	if (ret) {
-		log_err("fio: getsockaddr: %s\n", strerror(errno));
+		log_err("fio: getsockname: %s\n", strerror(errno));
 		return -1;
 	}
 
@@ -1444,7 +1435,7 @@
 		dst->min_run[i]		= cpu_to_le64(src->min_run[i]);
 		dst->max_bw[i]		= cpu_to_le64(src->max_bw[i]);
 		dst->min_bw[i]		= cpu_to_le64(src->min_bw[i]);
-		dst->io_kb[i]		= cpu_to_le64(src->io_kb[i]);
+		dst->iobytes[i]		= cpu_to_le64(src->iobytes[i]);
 		dst->agg[i]		= cpu_to_le64(src->agg[i]);
 	}
 
@@ -1485,6 +1476,7 @@
 		convert_io_stat(&p.ts.slat_stat[i], &ts->slat_stat[i]);
 		convert_io_stat(&p.ts.lat_stat[i], &ts->lat_stat[i]);
 		convert_io_stat(&p.ts.bw_stat[i], &ts->bw_stat[i]);
+		convert_io_stat(&p.ts.iops_stat[i], &ts->iops_stat[i]);
 	}
 
 	p.ts.usr_time		= cpu_to_le64(ts->usr_time);
@@ -1492,7 +1484,8 @@
 	p.ts.ctx		= cpu_to_le64(ts->ctx);
 	p.ts.minf		= cpu_to_le64(ts->minf);
 	p.ts.majf		= cpu_to_le64(ts->majf);
-	p.ts.clat_percentiles	= cpu_to_le64(ts->clat_percentiles);
+	p.ts.clat_percentiles	= cpu_to_le32(ts->clat_percentiles);
+	p.ts.lat_percentiles	= cpu_to_le32(ts->lat_percentiles);
 	p.ts.percentile_precision = cpu_to_le64(ts->percentile_precision);
 
 	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
@@ -1508,10 +1501,12 @@
 		p.ts.io_u_complete[i]	= cpu_to_le32(ts->io_u_complete[i]);
 	}
 
-	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) {
+	for (i = 0; i < FIO_IO_U_LAT_N_NR; i++)
+		p.ts.io_u_lat_n[i]	= cpu_to_le32(ts->io_u_lat_n[i]);
+	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
 		p.ts.io_u_lat_u[i]	= cpu_to_le32(ts->io_u_lat_u[i]);
+	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
 		p.ts.io_u_lat_m[i]	= cpu_to_le32(ts->io_u_lat_m[i]);
-	}
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++)
 		for (j = 0; j < FIO_IO_U_PLAT_NR; j++)
@@ -2279,7 +2274,7 @@
  * For local domain sockets:
  *	*ptr is the filename, *is_sock is 1.
  */
-int fio_server_parse_string(const char *str, char **ptr, int *is_sock,
+int fio_server_parse_string(const char *str, char **ptr, bool *is_sock,
 			    int *port, struct in_addr *inp,
 			    struct in6_addr *inp6, int *ipv6)
 {
@@ -2288,13 +2283,13 @@
 	int lport = 0;
 
 	*ptr = NULL;
-	*is_sock = 0;
+	*is_sock = false;
 	*port = fio_net_port;
 	*ipv6 = 0;
 
 	if (!strncmp(str, "sock:", 5)) {
 		*ptr = strdup(str + 5);
-		*is_sock = 1;
+		*is_sock = true;
 
 		return 0;
 	}
@@ -2373,7 +2368,8 @@
 static int fio_handle_server_arg(void)
 {
 	int port = fio_net_port;
-	int is_sock, ret = 0;
+	bool is_sock;
+	int ret = 0;
 
 	saddr_in.sin_addr.s_addr = htonl(INADDR_ANY);
 
@@ -2538,7 +2534,7 @@
 
 	pid = fork();
 	if (pid < 0) {
-		log_err("fio: failed server fork: %s", strerror(errno));
+		log_err("fio: failed server fork: %s\n", strerror(errno));
 		free(pidfile);
 		return -1;
 	} else if (pid) {
diff -Nru fio-2.16/server.h fio-3.1/server.h
--- fio-2.16/server.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/server.h	2017-09-28 10:23:20.000000000 +0000
@@ -12,6 +12,17 @@
 
 #define FIO_NET_PORT 8765
 
+struct sk_out {
+	unsigned int refs;	/* frees sk_out when it drops to zero.
+				 * protected by below ->lock */
+
+	int sk;			/* socket fd to talk to client */
+	struct fio_mutex lock;	/* protects ref and below list */
+	struct flist_head list;	/* list of pending transmit work */
+	struct fio_mutex wait;	/* wake backend when items added to list */
+	struct fio_mutex xmit;	/* held while sending data */
+};
+
 /*
  * On-wire encoding is little endian
  */
@@ -32,13 +43,13 @@
 
 struct fio_net_cmd_reply {
 	struct flist_head list;
-	struct timeval tv;
+	struct timespec ts;
 	uint64_t saved_tag;
 	uint16_t opcode;
 };
 
 enum {
-	FIO_SERVER_VER			= 60,
+	FIO_SERVER_VER			= 66,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 	FIO_SERVER_MAX_CMD_MB		= 2048,
@@ -201,7 +212,7 @@
 extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *);
 extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *);
 extern void fio_server_set_arg(const char *);
-extern int fio_server_parse_string(const char *, char **, int *, int *, struct in_addr *, struct in6_addr *, int *);
+extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *);
 extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *);
 extern const char *fio_server_op(unsigned int);
 extern void fio_server_got_signal(int);
diff -Nru fio-2.16/smalloc.c fio-3.1/smalloc.c
--- fio-2.16/smalloc.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/smalloc.c	2017-09-28 10:23:20.000000000 +0000
@@ -13,6 +13,7 @@
 #include <limits.h>
 #include <fcntl.h>
 
+#include "fio.h"
 #include "mutex.h"
 #include "arch/arch.h"
 #include "os/os.h"
@@ -188,7 +189,7 @@
 		goto out_fail;
 
 	pool->map = ptr;
-	pool->bitmap = (void *) ptr + (pool->nr_blocks * SMALLOC_BPL);
+	pool->bitmap = (unsigned int *)((char *) ptr + (pool->nr_blocks * SMALLOC_BPL));
 	memset(pool->bitmap, 0, bitmap_blocks * sizeof(unsigned int));
 
 	pool->lock = fio_mutex_init(FIO_MUTEX_UNLOCKED);
@@ -248,7 +249,7 @@
 	uintptr_t ptr;
 
 	ptr = (uintptr_t) hdr + hdr->size - sizeof(unsigned int);
-	ptr = (ptr + int_mask) & ~int_mask;
+	ptr = (uintptr_t) PTR_ALIGN(ptr, int_mask);
 
 	return (void *) ptr;
 }
diff -Nru fio-2.16/stat.c fio-3.1/stat.c
--- fio-2.16/stat.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/stat.c	2017-09-28 10:23:20.000000000 +0000
@@ -37,9 +37,9 @@
 	struct thread_stat *ts = &td->ts;
 
 	fio_getrusage(&td->ru_end);
-	ts->usr_time += mtime_since(&td->ru_start.ru_utime,
+	ts->usr_time += mtime_since_tv(&td->ru_start.ru_utime,
 					&td->ru_end.ru_utime);
-	ts->sys_time += mtime_since(&td->ru_start.ru_stime,
+	ts->sys_time += mtime_since_tv(&td->ru_start.ru_stime,
 					&td->ru_end.ru_stime);
 	ts->ctx += td->ru_end.ru_nvcsw + td->ru_end.ru_nivcsw
 			- (td->ru_start.ru_nvcsw + td->ru_start.ru_nivcsw);
@@ -58,7 +58,7 @@
  * group by looking at the index bits.
  *
  */
-static unsigned int plat_val_to_idx(unsigned int val)
+static unsigned int plat_val_to_idx(unsigned long long val)
 {
 	unsigned int msb, error_bits, base, offset, idx;
 
@@ -66,7 +66,7 @@
 	if (val == 0)
 		msb = 0;
 	else
-		msb = (sizeof(val)*8) - __builtin_clz(val) - 1;
+		msb = (sizeof(val)*8) - __builtin_clzll(val) - 1;
 
 	/*
 	 * MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
@@ -98,9 +98,10 @@
  * Convert the given index of the bucket array to the value
  * represented by the bucket
  */
-static unsigned int plat_idx_to_val(unsigned int idx)
+static unsigned long long plat_idx_to_val(unsigned int idx)
 {
-	unsigned int error_bits, k, base;
+	unsigned int error_bits;
+	unsigned long long k, base;
 
 	assert(idx < FIO_IO_U_PLAT_NR);
 
@@ -111,7 +112,7 @@
 
 	/* Find the group and compute the minimum value of that group */
 	error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1;
-	base = 1 << (error_bits + FIO_IO_U_PLAT_BITS);
+	base = ((unsigned long long) 1) << (error_bits + FIO_IO_U_PLAT_BITS);
 
 	/* Find its bucket number of the group */
 	k = idx % FIO_IO_U_PLAT_VAL;
@@ -135,16 +136,16 @@
 }
 
 unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
-				   fio_fp64_t *plist, unsigned int **output,
-				   unsigned int *maxv, unsigned int *minv)
+				   fio_fp64_t *plist, unsigned long long **output,
+				   unsigned long long *maxv, unsigned long long *minv)
 {
 	unsigned long sum = 0;
 	unsigned int len, i, j = 0;
 	unsigned int oval_len = 0;
-	unsigned int *ovals = NULL;
-	int is_last;
+	unsigned long long *ovals = NULL;
+	bool is_last;
 
-	*minv = -1U;
+	*minv = -1ULL;
 	*maxv = 0;
 
 	len = 0;
@@ -165,7 +166,7 @@
 	/*
 	 * Calculate bucket values, note down max and min values
 	 */
-	is_last = 0;
+	is_last = false;
 	for (i = 0; i < FIO_IO_U_PLAT_NR && !is_last; i++) {
 		sum += io_u_plat[i];
 		while (sum >= (plist[j].u.f / 100.0 * nr)) {
@@ -173,7 +174,7 @@
 
 			if (j == oval_len) {
 				oval_len += 100;
-				ovals = realloc(ovals, oval_len * sizeof(unsigned int));
+				ovals = realloc(ovals, oval_len * sizeof(*ovals));
 			}
 
 			ovals[j] = plat_idx_to_val(i);
@@ -182,7 +183,7 @@
 			if (ovals[j] > *maxv)
 				*maxv = ovals[j];
 
-			is_last = (j == len - 1);
+			is_last = (j == len - 1) != 0;
 			if (is_last)
 				break;
 
@@ -199,11 +200,14 @@
  */
 static void show_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
 				  fio_fp64_t *plist, unsigned int precision,
-				  struct buf_output *out)
+				  bool is_clat, struct buf_output *out)
 {
-	unsigned int len, j = 0, minv, maxv;
-	unsigned int *ovals;
-	int is_last, per_line, scale_down;
+	unsigned int divisor, len, i, j = 0;
+	unsigned long long minv, maxv;
+	unsigned long long *ovals;
+	int per_line, scale_down, time_width;
+	const char *pre = is_clat ? "clat" : " lat";
+	bool is_last;
 	char fmt[32];
 
 	len = calc_clat_percentiles(io_u_plat, nr, plist, &ovals, &maxv, &minv);
@@ -211,39 +215,42 @@
 		goto out;
 
 	/*
-	 * We default to usecs, but if the value range is such that we
-	 * should scale down to msecs, do that.
+	 * We default to nsecs, but if the value range is such that we
+	 * should scale down to usecs or msecs, do that.
 	 */
-	if (minv > 2000 && maxv > 99999) {
+	if (minv > 2000000 && maxv > 99999999ULL) {
+		scale_down = 2;
+		divisor = 1000000;
+		log_buf(out, "    %s percentiles (msec):\n     |", pre);
+	} else if (minv > 2000 && maxv > 99999) {
 		scale_down = 1;
-		log_buf(out, "    clat percentiles (msec):\n     |");
+		divisor = 1000;
+		log_buf(out, "    %s percentiles (usec):\n     |", pre);
 	} else {
 		scale_down = 0;
-		log_buf(out, "    clat percentiles (usec):\n     |");
+		divisor = 1;
+		log_buf(out, "    %s percentiles (nsec):\n     |", pre);
 	}
 
-	snprintf(fmt, sizeof(fmt), "%%1.%uf", precision);
-	per_line = (80 - 7) / (precision + 14);
 
-	for (j = 0; j < len; j++) {
-		char fbuf[16], *ptr = fbuf;
+	time_width = max(5, (int) (log10(maxv / divisor) + 1));
+	snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3,
+			precision, time_width);
+	/* fmt will be something like " %5.2fth=[%4llu]%c" */
+	per_line = (80 - 7) / (precision + 10 + time_width);
 
+	for (j = 0; j < len; j++) {
 		/* for formatting */
 		if (j != 0 && (j % per_line) == 0)
 			log_buf(out, "     |");
 
 		/* end of the list */
-		is_last = (j == len - 1);
-
-		if (plist[j].u.f < 10.0)
-			ptr += sprintf(fbuf, " ");
+		is_last = (j == len - 1) != 0;
 
-		snprintf(ptr, sizeof(fbuf), fmt, plist[j].u.f);
-
-		if (scale_down)
+		for (i = 0; i < scale_down; i++)
 			ovals[j] = (ovals[j] + 999) / 1000;
 
-		log_buf(out, " %sth=[%5u]%c", fbuf, ovals[j], is_last ? '\n' : ',');
+		log_buf(out, fmt, plist[j].u.f, ovals[j], is_last ? '\n' : ',');
 
 		if (is_last)
 			break;
@@ -257,8 +264,8 @@
 		free(ovals);
 }
 
-bool calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
-	      double *mean, double *dev)
+bool calc_lat(struct io_stat *is, unsigned long long *min,
+	      unsigned long long *max, double *mean, double *dev)
 {
 	double n = (double) is->samples;
 
@@ -279,7 +286,8 @@
 
 void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
 {
-	char *p1, *p2, *p3, *p4;
+	char *io, *agg, *min, *max;
+	char *ioalt, *aggalt, *minalt, *maxalt;
 	const char *str[] = { "   READ", "  WRITE" , "   TRIM"};
 	int i;
 
@@ -291,22 +299,28 @@
 		if (!rs->max_run[i])
 			continue;
 
-		p1 = num2str(rs->io_kb[i], 6, rs->kb_base, i2p, 8);
-		p2 = num2str(rs->agg[i], 6, rs->kb_base, i2p, rs->unit_base);
-		p3 = num2str(rs->min_bw[i], 6, rs->kb_base, i2p, rs->unit_base);
-		p4 = num2str(rs->max_bw[i], 6, rs->kb_base, i2p, rs->unit_base);
-
-		log_buf(out, "%s: io=%s, aggrb=%s/s, minb=%s/s, maxb=%s/s,"
-			 " mint=%llumsec, maxt=%llumsec\n",
+		io = num2str(rs->iobytes[i], 4, 1, i2p, N2S_BYTE);
+		ioalt = num2str(rs->iobytes[i], 4, 1, !i2p, N2S_BYTE);
+		agg = num2str(rs->agg[i], 4, 1, i2p, rs->unit_base);
+		aggalt = num2str(rs->agg[i], 4, 1, !i2p, rs->unit_base);
+		min = num2str(rs->min_bw[i], 4, 1, i2p, rs->unit_base);
+		minalt = num2str(rs->min_bw[i], 4, 1, !i2p, rs->unit_base);
+		max = num2str(rs->max_bw[i], 4, 1, i2p, rs->unit_base);
+		maxalt = num2str(rs->max_bw[i], 4, 1, !i2p, rs->unit_base);
+		log_buf(out, "%s: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n",
 				rs->unified_rw_rep ? "  MIXED" : str[i],
-				p1, p2, p3, p4,
+				agg, aggalt, min, max, minalt, maxalt, io, ioalt,
 				(unsigned long long) rs->min_run[i],
 				(unsigned long long) rs->max_run[i]);
 
-		free(p1);
-		free(p2);
-		free(p3);
-		free(p4);
+		free(io);
+		free(agg);
+		free(min);
+		free(max);
+		free(ioalt);
+		free(aggalt);
+		free(minalt);
+		free(maxalt);
 	}
 }
 
@@ -348,6 +362,28 @@
 	}
 }
 
+/*
+ * To keep the terse format unaltered, add all of the ns latency
+ * buckets to the first us latency bucket
+ */
+void stat_calc_lat_nu(struct thread_stat *ts, double *io_u_lat_u)
+{
+	unsigned long ntotal = 0, total = ddir_rw_sum(ts->total_io_u);
+	int i;
+
+	stat_calc_lat(ts, io_u_lat_u, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR);
+
+	for (i = 0; i < FIO_IO_U_LAT_N_NR; i++)
+		ntotal += ts->io_u_lat_n[i];
+
+	io_u_lat_u[0] += 100.0 * (double) ntotal / (double) total;
+}
+
+void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat)
+{
+	stat_calc_lat(ts, io_u_lat, ts->io_u_lat_n, FIO_IO_U_LAT_N_NR);
+}
+
 void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat)
 {
 	stat_calc_lat(ts, io_u_lat, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR);
@@ -358,17 +394,20 @@
 	stat_calc_lat(ts, io_u_lat, ts->io_u_lat_m, FIO_IO_U_LAT_M_NR);
 }
 
-static void display_lat(const char *name, unsigned long min, unsigned long max,
-			double mean, double dev, struct buf_output *out)
+static void display_lat(const char *name, unsigned long long min,
+			unsigned long long max, double mean, double dev,
+			struct buf_output *out)
 {
-	const char *base = "(usec)";
+	const char *base = "(nsec)";
 	char *minp, *maxp;
 
-	if (usec_to_msec(&min, &max, &mean, &dev))
+	if (nsec_to_msec(&min, &max, &mean, &dev))
 		base = "(msec)";
+	else if (nsec_to_usec(&min, &max, &mean, &dev))
+		base = "(usec)";
 
-	minp = num2str(min, 6, 1, 0, 0);
-	maxp = num2str(max, 6, 1, 0, 0);
+	minp = num2str(min, 6, 1, 0, N2S_NONE);
+	maxp = num2str(max, 6, 1, 0, N2S_NONE);
 
 	log_buf(out, "    %s %s: min=%s, max=%s, avg=%5.02f,"
 		 " stdev=%5.02f\n", name, base, minp, maxp, mean, dev);
@@ -380,11 +419,11 @@
 static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
 			     int ddir, struct buf_output *out)
 {
-	const char *str[] = { "read ", "write", "trim" };
-	unsigned long min, max, runt;
-	unsigned long long bw, iops;
+	const char *str[] = { " read", "write", " trim" };
+	unsigned long runt;
+	unsigned long long min, max, bw, iops;
 	double mean, dev;
-	char *io_p, *bw_p, *iops_p;
+	char *io_p, *bw_p, *bw_p_alt, *iops_p;
 	int i2p;
 
 	assert(ddir_rw(ddir));
@@ -396,19 +435,21 @@
 	runt = ts->runtime[ddir];
 
 	bw = (1000 * ts->io_bytes[ddir]) / runt;
-	io_p = num2str(ts->io_bytes[ddir], 6, 1, i2p, 8);
-	bw_p = num2str(bw, 6, 1, i2p, ts->unit_base);
+	io_p = num2str(ts->io_bytes[ddir], 4, 1, i2p, N2S_BYTE);
+	bw_p = num2str(bw, 4, 1, i2p, ts->unit_base);
+	bw_p_alt = num2str(bw, 4, 1, !i2p, ts->unit_base);
 
 	iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt;
-	iops_p = num2str(iops, 6, 1, 0, 0);
+	iops_p = num2str(iops, 4, 1, 0, N2S_NONE);
 
-	log_buf(out, "  %s: io=%s, bw=%s/s, iops=%s, runt=%6llumsec\n",
-				rs->unified_rw_rep ? "mixed" : str[ddir],
-				io_p, bw_p, iops_p,
-				(unsigned long long) ts->runtime[ddir]);
+	log_buf(out, "  %s: IOPS=%s, BW=%s (%s)(%s/%llumsec)\n",
+			rs->unified_rw_rep ? "mixed" : str[ddir],
+			iops_p, bw_p, bw_p_alt, io_p,
+			(unsigned long long) ts->runtime[ddir]);
 
 	free(io_p);
 	free(bw_p);
+	free(bw_p_alt);
 	free(iops_p);
 
 	if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev))
@@ -418,15 +459,31 @@
 	if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
 		display_lat(" lat", min, max, mean, dev, out);
 
-	if (ts->clat_percentiles) {
+	if (ts->clat_percentiles || ts->lat_percentiles) {
 		show_clat_percentiles(ts->io_u_plat[ddir],
 					ts->clat_stat[ddir].samples,
 					ts->percentile_list,
-					ts->percentile_precision, out);
+					ts->percentile_precision,
+					ts->clat_percentiles, out);
 	}
 	if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
 		double p_of_agg = 100.0, fkb_base = (double)rs->kb_base;
-		const char *bw_str = (rs->unit_base == 1 ? "Kbit" : "KB");
+		const char *bw_str;
+
+		if ((rs->unit_base == 1) && i2p)
+			bw_str = "Kibit";
+		else if (rs->unit_base == 1)
+			bw_str = "kbit";
+		else if (i2p)
+			bw_str = "KiB";
+		else
+			bw_str = "kB";
+
+		if (rs->agg[ddir]) {
+			p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024);
+			if (p_of_agg > 100.0)
+				p_of_agg = 100.0;
+		}
 
 		if (rs->unit_base == 1) {
 			min *= 8.0;
@@ -435,40 +492,41 @@
 			dev *= 8.0;
 		}
 
-		if (rs->agg[ddir]) {
-			p_of_agg = mean * 100 / (double) rs->agg[ddir];
-			if (p_of_agg > 100.0)
-				p_of_agg = 100.0;
-		}
-
 		if (mean > fkb_base * fkb_base) {
 			min /= fkb_base;
 			max /= fkb_base;
 			mean /= fkb_base;
 			dev /= fkb_base;
-			bw_str = (rs->unit_base == 1 ? "Mbit" : "MB");
+			bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB");
 		}
 
-		log_buf(out, "    bw (%-4s/s): min=%5lu, max=%5lu, per=%3.2f%%,"
-			 " avg=%5.02f, stdev=%5.02f\n", bw_str, min, max,
-							p_of_agg, mean, dev);
+		log_buf(out, "   bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, "
+			"avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
+			bw_str, min, max, p_of_agg, mean, dev,
+			(&ts->bw_stat[ddir])->samples);
+	}
+	if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) {
+		log_buf(out, "   iops        : min=%5llu, max=%5llu, "
+			"avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
+			min, max, mean, dev, (&ts->iops_stat[ddir])->samples);
 	}
 }
 
-static int show_lat(double *io_u_lat, int nr, const char **ranges,
-		    const char *msg, struct buf_output *out)
+static bool show_lat(double *io_u_lat, int nr, const char **ranges,
+		     const char *msg, struct buf_output *out)
 {
-	int new_line = 1, i, line = 0, shown = 0;
+	bool new_line = true, shown = false;
+	int i, line = 0;
 
 	for (i = 0; i < nr; i++) {
 		if (io_u_lat[i] <= 0.0)
 			continue;
-		shown = 1;
+		shown = true;
 		if (new_line) {
 			if (line)
 				log_buf(out, "\n");
-			log_buf(out, "    lat (%s) : ", msg);
-			new_line = 0;
+			log_buf(out, "  lat (%s)   : ", msg);
+			new_line = false;
 			line = 0;
 		}
 		if (line)
@@ -476,13 +534,21 @@
 		log_buf(out, "%s%3.2f%%", ranges[i], io_u_lat[i]);
 		line++;
 		if (line == 5)
-			new_line = 1;
+			new_line = true;
 	}
 
 	if (shown)
 		log_buf(out, "\n");
 
-	return shown;
+	return true;
+}
+
+static void show_lat_n(double *io_u_lat_n, struct buf_output *out)
+{
+	const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=",
+				 "250=", "500=", "750=", "1000=", };
+
+	show_lat(io_u_lat_n, FIO_IO_U_LAT_N_NR, ranges, "nsec", out);
 }
 
 static void show_lat_u(double *io_u_lat_u, struct buf_output *out)
@@ -504,12 +570,15 @@
 
 static void show_latencies(struct thread_stat *ts, struct buf_output *out)
 {
+	double io_u_lat_n[FIO_IO_U_LAT_N_NR];
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	double io_u_lat_m[FIO_IO_U_LAT_M_NR];
 
+	stat_calc_lat_n(ts, io_u_lat_n);
 	stat_calc_lat_u(ts, io_u_lat_u);
 	stat_calc_lat_m(ts, io_u_lat_m);
 
+	show_lat_n(io_u_lat_n, out);
 	show_lat_u(io_u_lat_u, out);
 	show_lat_m(io_u_lat_m, out);
 }
@@ -659,7 +728,7 @@
 
 static void show_ss_normal(struct thread_stat *ts, struct buf_output *out)
 {
-	char *p1, *p2;
+	char *p1, *p1alt, *p2;
 	unsigned long long bw_mean, iops_mean;
 	const int i2p = is_power_of_2(ts->kb_base);
 
@@ -669,18 +738,20 @@
 	bw_mean = steadystate_bw_mean(ts);
 	iops_mean = steadystate_iops_mean(ts);
 
-	p1 = num2str(bw_mean / ts->kb_base, 6, ts->kb_base, i2p, ts->unit_base);
-	p2 = num2str(iops_mean, 6, 1, 0, 0);
+	p1 = num2str(bw_mean / ts->kb_base, 4, ts->kb_base, i2p, ts->unit_base);
+	p1alt = num2str(bw_mean / ts->kb_base, 4, ts->kb_base, !i2p, ts->unit_base);
+	p2 = num2str(iops_mean, 4, 1, 0, N2S_NONE);
 
-	log_buf(out, "  steadystate  : attained=%s, bw=%s/s, iops=%s, %s%s=%.3f%s\n",
+	log_buf(out, "  steadystate  : attained=%s, bw=%s (%s), iops=%s, %s%s=%.3f%s\n",
 		ts->ss_state & __FIO_SS_ATTAINED ? "yes" : "no",
-		p1, p2,
+		p1, p1alt, p2,
 		ts->ss_state & __FIO_SS_IOPS ? "iops" : "bw",
 		ts->ss_state & __FIO_SS_SLOPE ? " slope": " mean dev",
 		ts->ss_criterion.u.f,
 		ts->ss_state & __FIO_SS_PCT ? "%" : "");
 
 	free(p1);
+	free(p1alt);
 	free(p2);
 }
 
@@ -761,9 +832,9 @@
 					io_u_dist[1], io_u_dist[2],
 					io_u_dist[3], io_u_dist[4],
 					io_u_dist[5], io_u_dist[6]);
-	log_buf(out, "     issued    : total=r=%llu/w=%llu/d=%llu,"
-				 " short=r=%llu/w=%llu/d=%llu,"
-				 " drop=r=%llu/w=%llu/d=%llu\n",
+	log_buf(out, "     issued rwt: total=%llu,%llu,%llu,"
+				 " short=%llu,%llu,%llu,"
+				 " dropped=%llu,%llu,%llu\n",
 					(unsigned long long) ts->total_io_u[0],
 					(unsigned long long) ts->total_io_u[1],
 					(unsigned long long) ts->total_io_u[2],
@@ -797,14 +868,13 @@
 
 static void show_ddir_status_terse(struct thread_stat *ts,
 				   struct group_run_stats *rs, int ddir,
-				   struct buf_output *out)
+				   int ver, struct buf_output *out)
 {
-	unsigned long min, max;
-	unsigned long long bw, iops;
-	unsigned int *ovals = NULL;
+	unsigned long long min, max, minv, maxv, bw, iops;
+	unsigned long long *ovals = NULL;
 	double mean, dev;
-	unsigned int len, minv, maxv;
-	int i;
+	unsigned int len;
+	int i, bw_stat;
 
 	assert(ddir_rw(ddir));
 
@@ -812,7 +882,7 @@
 	if (ts->runtime[ddir]) {
 		uint64_t runt = ts->runtime[ddir];
 
-		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024;
+		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; /* KiB/s */
 		iops = (1000 * (uint64_t) ts->total_io_u[ddir]) / runt;
 	}
 
@@ -821,16 +891,16 @@
 					(unsigned long long) ts->runtime[ddir]);
 
 	if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev))
-		log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
+		log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000);
 	else
-		log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+		log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
 
 	if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev))
-		log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
+		log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000);
 	else
-		log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+		log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
 
-	if (ts->clat_percentiles) {
+	if (ts->clat_percentiles || ts->lat_percentiles) {
 		len = calc_clat_percentiles(ts->io_u_plat[ddir],
 					ts->clat_stat[ddir].samples,
 					ts->percentile_list, &ovals, &maxv,
@@ -843,39 +913,53 @@
 			log_buf(out, ";0%%=0");
 			continue;
 		}
-		log_buf(out, ";%f%%=%u", ts->percentile_list[i].u.f, ovals[i]);
+		log_buf(out, ";%f%%=%llu", ts->percentile_list[i].u.f, ovals[i]/1000);
 	}
 
 	if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
-		log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
+		log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000);
 	else
-		log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+		log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
 
 	if (ovals)
 		free(ovals);
 
-	if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
+	bw_stat = calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev);
+	if (bw_stat) {
 		double p_of_agg = 100.0;
 
 		if (rs->agg[ddir]) {
-			p_of_agg = mean * 100 / (double) rs->agg[ddir];
+			p_of_agg = mean * 100 / ((double) rs->agg[ddir] / 1024); /* fp divide: agg < 1024 must not truncate to 0 */
 			if (p_of_agg > 100.0)
 				p_of_agg = 100.0;
 		}
 
-		log_buf(out, ";%lu;%lu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
+		log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
 	} else
-		log_buf(out, ";%lu;%lu;%f%%;%f;%f", 0UL, 0UL, 0.0, 0.0, 0.0);
+		log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0);
+
+	if (ver == 5) {
+		if (bw_stat)
+			log_buf(out, ";%" PRIu64, (&ts->bw_stat[ddir])->samples);
+		else
+			log_buf(out, ";%lu", 0UL);
+
+		if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev))
+			log_buf(out, ";%llu;%llu;%f;%f;%" PRIu64, min, max,
+				mean, dev, (&ts->iops_stat[ddir])->samples);
+		else
+			log_buf(out, ";%llu;%llu;%f;%f;%lu", 0ULL, 0ULL, 0.0, 0.0, 0UL);
+	}
 }
 
 static void add_ddir_status_json(struct thread_stat *ts,
 		struct group_run_stats *rs, int ddir, struct json_object *parent)
 {
-	unsigned long min, max;
+	unsigned long long min, max, minv, maxv;
 	unsigned long long bw;
-	unsigned int *ovals = NULL;
+	unsigned long long *ovals = NULL;
 	double mean, dev, iops;
-	unsigned int len, minv, maxv;
+	unsigned int len;
 	int i;
 	const char *ddirname[] = {"read", "write", "trim"};
 	struct json_object *dir_object, *tmp_object, *percentile_object, *clat_bins_object;
@@ -896,11 +980,12 @@
 	if (ts->runtime[ddir]) {
 		uint64_t runt = ts->runtime[ddir];
 
-		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024;
+		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; /* KiB/s */
 		iops = (1000.0 * (uint64_t) ts->total_io_u[ddir]) / runt;
 	}
 
-	json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir] >> 10);
+	json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir]);
+	json_object_add_value_int(dir_object, "io_kbytes", ts->io_bytes[ddir] >> 10);
 	json_object_add_value_int(dir_object, "bw", bw);
 	json_object_add_value_float(dir_object, "iops", iops);
 	json_object_add_value_int(dir_object, "runtime", ts->runtime[ddir]);
@@ -913,7 +998,7 @@
 		mean = dev = 0.0;
 	}
 	tmp_object = json_create_object();
-	json_object_add_value_object(dir_object, "slat", tmp_object);
+	json_object_add_value_object(dir_object, "slat_ns", tmp_object);
 	json_object_add_value_int(tmp_object, "min", min);
 	json_object_add_value_int(tmp_object, "max", max);
 	json_object_add_value_float(tmp_object, "mean", mean);
@@ -924,13 +1009,13 @@
 		mean = dev = 0.0;
 	}
 	tmp_object = json_create_object();
-	json_object_add_value_object(dir_object, "clat", tmp_object);
+	json_object_add_value_object(dir_object, "clat_ns", tmp_object);
 	json_object_add_value_int(tmp_object, "min", min);
 	json_object_add_value_int(tmp_object, "max", max);
 	json_object_add_value_float(tmp_object, "mean", mean);
 	json_object_add_value_float(tmp_object, "stddev", dev);
 
-	if (ts->clat_percentiles) {
+	if (ts->clat_percentiles || ts->lat_percentiles) {
 		len = calc_clat_percentiles(ts->io_u_plat[ddir],
 					ts->clat_stat[ddir].samples,
 					ts->percentile_list, &ovals, &maxv,
@@ -953,12 +1038,11 @@
 		clat_bins_object = json_create_object();
 		json_object_add_value_object(tmp_object, "bins", clat_bins_object);
 		for(i = 0; i < FIO_IO_U_PLAT_NR; i++) {
-			snprintf(buf, sizeof(buf), "%d", i);
-			json_object_add_value_int(clat_bins_object, (const char *)buf, ts->io_u_plat[ddir][i]);
+			if (ts->io_u_plat[ddir][i]) {
+				snprintf(buf, sizeof(buf), "%llu", plat_idx_to_val(i));
+				json_object_add_value_int(clat_bins_object, (const char *)buf, ts->io_u_plat[ddir][i]);
+			}
 		}
-		json_object_add_value_int(clat_bins_object, "FIO_IO_U_PLAT_BITS", FIO_IO_U_PLAT_BITS);
-		json_object_add_value_int(clat_bins_object, "FIO_IO_U_PLAT_VAL", FIO_IO_U_PLAT_VAL);
-		json_object_add_value_int(clat_bins_object, "FIO_IO_U_PLAT_NR", FIO_IO_U_PLAT_NR);
 	}
 
 	if (!calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) {
@@ -966,7 +1050,7 @@
 		mean = dev = 0.0;
 	}
 	tmp_object = json_create_object();
-	json_object_add_value_object(dir_object, "lat", tmp_object);
+	json_object_add_value_object(dir_object, "lat_ns", tmp_object);
 	json_object_add_value_int(tmp_object, "min", min);
 	json_object_add_value_int(tmp_object, "max", max);
 	json_object_add_value_float(tmp_object, "mean", mean);
@@ -976,7 +1060,7 @@
 
 	if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
 		if (rs->agg[ddir]) {
-			p_of_agg = mean * 100 / (double) rs->agg[ddir];
+			p_of_agg = mean * 100 / ((double) rs->agg[ddir] / 1024); /* fp divide: agg < 1024 must not truncate to 0 */
 			if (p_of_agg > 100.0)
 				p_of_agg = 100.0;
 		}
@@ -989,74 +1073,24 @@
 	json_object_add_value_float(dir_object, "bw_agg", p_of_agg);
 	json_object_add_value_float(dir_object, "bw_mean", mean);
 	json_object_add_value_float(dir_object, "bw_dev", dev);
-}
-
-static void show_thread_status_terse_v2(struct thread_stat *ts,
-					struct group_run_stats *rs,
-					struct buf_output *out)
-{
-	double io_u_dist[FIO_IO_U_MAP_NR];
-	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
-	double io_u_lat_m[FIO_IO_U_LAT_M_NR];
-	double usr_cpu, sys_cpu;
-	int i;
-
-	/* General Info */
-	log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error);
-	/* Log Read Status */
-	show_ddir_status_terse(ts, rs, DDIR_READ, out);
-	/* Log Write Status */
-	show_ddir_status_terse(ts, rs, DDIR_WRITE, out);
-	/* Log Trim Status */
-	show_ddir_status_terse(ts, rs, DDIR_TRIM, out);
+	json_object_add_value_int(dir_object, "bw_samples",
+				(&ts->bw_stat[ddir])->samples);
 
-	/* CPU Usage */
-	if (ts->total_run_time) {
-		double runt = (double) ts->total_run_time;
-
-		usr_cpu = (double) ts->usr_time * 100 / runt;
-		sys_cpu = (double) ts->sys_time * 100 / runt;
-	} else {
-		usr_cpu = 0;
-		sys_cpu = 0;
+	if (!calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) {
+		min = max = 0;
+		mean = dev = 0.0;
 	}
-
-	log_buf(out, ";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu,
-						(unsigned long long) ts->ctx,
-						(unsigned long long) ts->majf,
-						(unsigned long long) ts->minf);
-
-	/* Calc % distribution of IO depths, usecond, msecond latency */
-	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
-	stat_calc_lat_u(ts, io_u_lat_u);
-	stat_calc_lat_m(ts, io_u_lat_m);
-
-	/* Only show fixed 7 I/O depth levels*/
-	log_buf(out, ";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%",
-			io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3],
-			io_u_dist[4], io_u_dist[5], io_u_dist[6]);
-
-	/* Microsecond latency */
-	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
-		log_buf(out, ";%3.2f%%", io_u_lat_u[i]);
-	/* Millisecond latency */
-	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
-		log_buf(out, ";%3.2f%%", io_u_lat_m[i]);
-	/* Additional output if continue_on_error set - default off*/
-	if (ts->continue_on_error)
-		log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error);
-	log_buf(out, "\n");
-
-	/* Additional output if description is set */
-	if (strlen(ts->description))
-		log_buf(out, ";%s", ts->description);
-
-	log_buf(out, "\n");
+	json_object_add_value_int(dir_object, "iops_min", min);
+	json_object_add_value_int(dir_object, "iops_max", max);
+	json_object_add_value_float(dir_object, "iops_mean", mean);
+	json_object_add_value_float(dir_object, "iops_stddev", dev);
+	json_object_add_value_int(dir_object, "iops_samples",
+				(&ts->iops_stat[ddir])->samples);
 }
 
-static void show_thread_status_terse_v3_v4(struct thread_stat *ts,
-					   struct group_run_stats *rs, int ver,
-					   struct buf_output *out)
+static void show_thread_status_terse_all(struct thread_stat *ts,
+					 struct group_run_stats *rs, int ver,
+					 struct buf_output *out)
 {
 	double io_u_dist[FIO_IO_U_MAP_NR];
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
@@ -1065,15 +1099,19 @@
 	int i;
 
 	/* General Info */
-	log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string,
-					ts->name, ts->groupid, ts->error);
+	if (ver == 2)
+		log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error);
+	else
+		log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string,
+			ts->name, ts->groupid, ts->error);
+
 	/* Log Read Status */
-	show_ddir_status_terse(ts, rs, DDIR_READ, out);
+	show_ddir_status_terse(ts, rs, DDIR_READ, ver, out);
 	/* Log Write Status */
-	show_ddir_status_terse(ts, rs, DDIR_WRITE, out);
+	show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out);
 	/* Log Trim Status */
-	if (ver == 4)
-		show_ddir_status_terse(ts, rs, DDIR_TRIM, out);
+	if (ver == 2 || ver == 4 || ver == 5)
+		show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out);
 
 	/* CPU Usage */
 	if (ts->total_run_time) {
@@ -1093,7 +1131,7 @@
 
 	/* Calc % distribution of IO depths, usecond, msecond latency */
 	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
-	stat_calc_lat_u(ts, io_u_lat_u);
+	stat_calc_lat_nu(ts, io_u_lat_u);
 	stat_calc_lat_m(ts, io_u_lat_m);
 
 	/* Only show fixed 7 I/O depth levels*/
@@ -1109,11 +1147,14 @@
 		log_buf(out, ";%3.2f%%", io_u_lat_m[i]);
 
 	/* disk util stats, if any */
-	show_disk_util(1, NULL, out);
+	if (ver >= 3)
+		show_disk_util(1, NULL, out);
 
 	/* Additional output if continue_on_error set - default off*/
 	if (ts->continue_on_error)
 		log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error);
+	if (ver == 2)
+		log_buf(out, "\n");
 
 	/* Additional output if description is set */
 	if (strlen(ts->description))
@@ -1154,6 +1195,7 @@
 	struct json_object *root, *tmp;
 	struct jobs_eta *je;
 	double io_u_dist[FIO_IO_U_MAP_NR];
+	double io_u_lat_n[FIO_IO_U_LAT_N_NR];
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	double io_u_lat_m[FIO_IO_U_LAT_M_NR];
 	double usr_cpu, sys_cpu;
@@ -1198,6 +1240,7 @@
 
 	/* Calc % distribution of IO depths, usecond, msecond latency */
 	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
+	stat_calc_lat_n(ts, io_u_lat_n);
 	stat_calc_lat_u(ts, io_u_lat_u);
 	stat_calc_lat_m(ts, io_u_lat_m);
 
@@ -1213,9 +1256,17 @@
 		json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]);
 	}
 
+	/* Nanosecond latency */
 	tmp = json_create_object();
-	json_object_add_value_object(root, "latency_us", tmp);
+	json_object_add_value_object(root, "latency_ns", tmp);
+	for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) {
+		const char *ranges[] = { "2", "4", "10", "20", "50", "100",
+				 "250", "500", "750", "1000", };
+		json_object_add_value_float(tmp, ranges[i], io_u_lat_n[i]);
+	}
 	/* Microsecond latency */
+	tmp = json_create_object();
+	json_object_add_value_object(root, "latency_us", tmp);
 	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) {
 		const char *ranges[] = { "2", "4", "10", "20", "50", "100",
 				 "250", "500", "750", "1000", };
@@ -1344,10 +1395,8 @@
 				     struct group_run_stats *rs,
 				     struct buf_output *out)
 {
-	if (terse_version == 2)
-		show_thread_status_terse_v2(ts, rs, out);
-	else if (terse_version == 3 || terse_version == 4)
-		show_thread_status_terse_v3_v4(ts, rs, terse_version, out);
+	if (terse_version >= 2 && terse_version <= 5)
+		show_thread_status_terse_all(ts, rs, terse_version, out);
 	else
 		log_err("fio: bad terse version!? %d\n", terse_version);
 }
@@ -1418,7 +1467,7 @@
 		if (dst->min_bw[i] && dst->min_bw[i] > src->min_bw[i])
 			dst->min_bw[i] = src->min_bw[i];
 
-		dst->io_kb[i] += src->io_kb[i];
+		dst->iobytes[i] += src->iobytes[i];
 		dst->agg[i] += src->agg[i];
 	}
 
@@ -1439,6 +1488,7 @@
 			sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first);
 			sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first);
 			sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first);
+			sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first);
 
 			dst->io_bytes[l] += src->io_bytes[l];
 
@@ -1449,6 +1499,7 @@
 			sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first);
 			sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first);
 			sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first);
+			sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first);
 
 			dst->io_bytes[0] += src->io_bytes[l];
 
@@ -1475,6 +1526,8 @@
 		dst->io_u_submit[k] += src->io_u_submit[k];
 	for (k = 0; k < FIO_IO_U_MAP_NR; k++)
 		dst->io_u_complete[k] += src->io_u_complete[k];
+	for (k = 0; k < FIO_IO_U_LAT_N_NR; k++)
+		dst->io_u_lat_n[k] += src->io_u_lat_n[k];
 	for (k = 0; k < FIO_IO_U_LAT_U_NR; k++)
 		dst->io_u_lat_u[k] += src->io_u_lat_u[k];
 	for (k = 0; k < FIO_IO_U_LAT_M_NR; k++)
@@ -1528,6 +1581,7 @@
 		ts->clat_stat[j].min_val = -1UL;
 		ts->slat_stat[j].min_val = -1UL;
 		ts->bw_stat[j].min_val = -1UL;
+		ts->iops_stat[j].min_val = -1UL;
 	}
 	ts->groupid = -1;
 }
@@ -1538,8 +1592,8 @@
 	struct thread_data *td;
 	struct thread_stat *threadstats, *ts;
 	int i, j, k, nr_ts, last_ts, idx;
-	int kb_base_warned = 0;
-	int unit_base_warned = 0;
+	bool kb_base_warned = false;
+	bool unit_base_warned = false;
 	struct json_object *root = NULL;
 	struct json_array *array = NULL;
 	struct buf_output output[FIO_OUTPUT_NR];
@@ -1563,6 +1617,8 @@
 		}
 		if (last_ts == td->groupid)
 			continue;
+		if (!td->o.stats)
+			continue;
 
 		last_ts = td->groupid;
 		nr_ts++;
@@ -1580,6 +1636,8 @@
 	last_ts = -1;
 	idx = 0;
 	for_each_td(td, i) {
+		if (!td->o.stats)
+			continue;
 		if (idx && (!td->o.group_reporting ||
 		    (td->o.group_reporting && last_ts != td->groupid))) {
 			idx = 0;
@@ -1591,6 +1649,7 @@
 		ts = &threadstats[j];
 
 		ts->clat_percentiles = td->o.clat_percentiles;
+		ts->lat_percentiles = td->o.lat_percentiles;
 		ts->percentile_precision = td->o.percentile_precision;
 		memcpy(ts->percentile_list, td->o.percentile_list, sizeof(td->o.percentile_list));
 		opt_lists[j] = &td->opt_list;
@@ -1627,11 +1686,11 @@
 		} else if (ts->kb_base != td->o.kb_base && !kb_base_warned) {
 			log_info("fio: kb_base differs for jobs in group, using"
 				 " %u as the base\n", ts->kb_base);
-			kb_base_warned = 1;
+			kb_base_warned = true;
 		} else if (ts->unit_base != td->o.unit_base && !unit_base_warned) {
 			log_info("fio: unit_base differs for jobs in group, using"
 				 " %u as the base\n", ts->unit_base);
-			unit_base_warned = 1;
+			unit_base_warned = true;
 		}
 
 		ts->continue_on_error = td->o.continue_on_error;
@@ -1696,19 +1755,14 @@
 				rs->max_run[j] = ts->runtime[j];
 
 			bw = 0;
-			if (ts->runtime[j]) {
-				unsigned long runt = ts->runtime[j];
-				unsigned long long kb;
-
-				kb = ts->io_bytes[j] / rs->kb_base;
-				bw = kb * 1000 / runt;
-			}
+			if (ts->runtime[j])
+				bw = ts->io_bytes[j] * 1000 / ts->runtime[j];
 			if (bw < rs->min_bw[j])
 				rs->min_bw[j] = bw;
 			if (bw > rs->max_bw[j])
 				rs->max_bw[j] = bw;
 
-			rs->io_kb[j] += ts->io_bytes[j] / rs->kb_base;
+			rs->iobytes[j] += ts->io_bytes[j];
 		}
 	}
 
@@ -1719,7 +1773,7 @@
 
 		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 			if (rs->max_run[ddir])
-				rs->agg[ddir] = (rs->io_kb[ddir] * 1000) /
+				rs->agg[ddir] = (rs->iobytes[ddir] * 1000) /
 						rs->max_run[ddir];
 		}
 	}
@@ -1808,8 +1862,10 @@
 	}
 
 	for (i = 0; i < FIO_OUTPUT_NR; i++) {
-		buf_output_flush(&output[i]);
-		buf_output_free(&output[i]);
+		struct buf_output *out = &output[i];
+
+		log_info_buf(out->buf, out->buflen);
+		buf_output_free(out);
 	}
 
 	log_info_flush();
@@ -1829,22 +1885,22 @@
 {
 	struct thread_data *td;
 	unsigned long long *rt;
-	struct timeval tv;
+	struct timespec ts;
 	int i;
 
 	fio_mutex_down(stat_mutex);
 
 	rt = malloc(thread_number * sizeof(unsigned long long));
-	fio_gettime(&tv, NULL);
+	fio_gettime(&ts, NULL);
 
 	for_each_td(td, i) {
 		td->update_rusage = 1;
 		td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ];
 		td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE];
 		td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM];
-		td->ts.total_run_time = mtime_since(&td->epoch, &tv);
+		td->ts.total_run_time = mtime_since(&td->epoch, &ts);
 
-		rt[i] = mtime_since(&td->start, &tv);
+		rt[i] = mtime_since(&td->start, &ts);
 		if (td_read(td) && td->ts.io_bytes[DDIR_READ])
 			td->ts.runtime[DDIR_READ] += rt[i];
 		if (td_write(td) && td->ts.io_bytes[DDIR_WRITE])
@@ -1878,9 +1934,9 @@
 	fio_mutex_up(stat_mutex);
 }
 
-static int status_interval_init;
-static struct timeval status_time;
-static int status_file_disabled;
+static bool status_interval_init;
+static struct timespec status_time;
+static bool status_file_disabled;
 
 #define FIO_STATUS_FILE		"fio-dump-status"
 
@@ -1911,7 +1967,7 @@
 		log_err("fio: failed to unlink %s: %s\n", fio_status_file_path,
 							strerror(errno));
 		log_err("fio: disabling status file updates\n");
-		status_file_disabled = 1;
+		status_file_disabled = true;
 	}
 
 	return 1;
@@ -1922,7 +1978,7 @@
 	if (status_interval) {
 		if (!status_interval_init) {
 			fio_gettime(&status_time, NULL);
-			status_interval_init = 1;
+			status_interval_init = true;
 		} else if (mtime_since_now(&status_time) >= status_interval) {
 			show_running_run_stats();
 			fio_gettime(&status_time, NULL);
@@ -1935,7 +1991,7 @@
 	}
 }
 
-static inline void add_stat_sample(struct io_stat *is, unsigned long data)
+static inline void add_stat_sample(struct io_stat *is, unsigned long long data)
 {
 	double val = data;
 	double delta;
@@ -2108,7 +2164,7 @@
 	if (iolog->disabled)
 		return;
 	if (flist_empty(&iolog->io_logs))
-		iolog->avg_last = t;
+		iolog->avg_last[ddir] = t;
 
 	cur_log = get_cur_log(iolog);
 	if (cur_log) {
@@ -2154,6 +2210,9 @@
 
 		ts->io_bytes[i] = 0;
 		ts->runtime[i] = 0;
+		ts->total_io_u[i] = 0;
+		ts->short_io_u[i] = 0;
+		ts->drop_io_u[i] = 0;
 
 		for (j = 0; j < FIO_IO_U_PLAT_NR; j++)
 			ts->io_u_plat[i][j] = 0;
@@ -2163,17 +2222,17 @@
 		ts->io_u_map[i] = 0;
 		ts->io_u_submit[i] = 0;
 		ts->io_u_complete[i] = 0;
+	}
+
+	for (i = 0; i < FIO_IO_U_LAT_N_NR; i++)
+		ts->io_u_lat_n[i] = 0;
+	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
 		ts->io_u_lat_u[i] = 0;
+	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
 		ts->io_u_lat_m[i] = 0;
-		ts->total_submit = 0;
-		ts->total_complete = 0;
-	}
 
-	for (i = 0; i < 3; i++) {
-		ts->total_io_u[i] = 0;
-		ts->short_io_u[i] = 0;
-		ts->drop_io_u[i] = 0;
-	}
+	ts->total_submit = 0;
+	ts->total_complete = 0;
 }
 
 static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir,
@@ -2236,9 +2295,9 @@
 	 * If period hasn't passed, adding the above sample is all we
 	 * need to do.
 	 */
-	this_window = elapsed - iolog->avg_last;
-	if (elapsed < iolog->avg_last)
-		return iolog->avg_last - elapsed;
+	this_window = elapsed - iolog->avg_last[ddir];
+	if (elapsed < iolog->avg_last[ddir])
+		return iolog->avg_last[ddir] - elapsed;
 	else if (this_window < iolog->avg_msec) {
 		int diff = iolog->avg_msec - this_window;
 
@@ -2246,9 +2305,9 @@
 			return diff;
 	}
 
-	_add_stat_to_log(iolog, elapsed, td->o.log_max != 0);
+	__add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0);
 
-	iolog->avg_last = elapsed - (this_window - iolog->avg_msec);
+	iolog->avg_last[ddir] = elapsed - (this_window - iolog->avg_msec);
 	return iolog->avg_msec;
 }
 
@@ -2282,16 +2341,16 @@
 }
 
 static void add_clat_percentile_sample(struct thread_stat *ts,
-				unsigned long usec, enum fio_ddir ddir)
+				unsigned long long nsec, enum fio_ddir ddir)
 {
-	unsigned int idx = plat_val_to_idx(usec);
+	unsigned int idx = plat_val_to_idx(nsec);
 	assert(idx < FIO_IO_U_PLAT_NR);
 
 	ts->io_u_plat[ddir][idx]++;
 }
 
 void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
-		     unsigned long usec, unsigned int bs, uint64_t offset)
+		     unsigned long long nsec, unsigned int bs, uint64_t offset)
 {
 	unsigned long elapsed, this_window;
 	struct thread_stat *ts = &td->ts;
@@ -2299,14 +2358,14 @@
 
 	td_io_u_lock(td);
 
-	add_stat_sample(&ts->clat_stat[ddir], usec);
+	add_stat_sample(&ts->clat_stat[ddir], nsec);
 
 	if (td->clat_log)
-		add_log_sample(td, td->clat_log, sample_val(usec), ddir, bs,
+		add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs,
 			       offset);
 
 	if (ts->clat_percentiles)
-		add_clat_percentile_sample(ts, usec, ddir);
+		add_clat_percentile_sample(ts, nsec, ddir);
 
 	if (iolog && iolog->hist_msec) {
 		struct io_hist *hw = &iolog->hist_window[ddir];
@@ -2368,7 +2427,7 @@
 }
 
 void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
-		    unsigned long usec, unsigned int bs, uint64_t offset)
+		    unsigned long long nsec, unsigned int bs, uint64_t offset)
 {
 	struct thread_stat *ts = &td->ts;
 
@@ -2377,23 +2436,26 @@
 
 	td_io_u_lock(td);
 
-	add_stat_sample(&ts->lat_stat[ddir], usec);
+	add_stat_sample(&ts->lat_stat[ddir], nsec);
 
 	if (td->lat_log)
-		add_log_sample(td, td->lat_log, sample_val(usec), ddir, bs,
+		add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs,
 			       offset);
 
+	if (ts->lat_percentiles)
+		add_clat_percentile_sample(ts, nsec, ddir);
+
 	td_io_u_unlock(td);
 }
 
 void add_bw_sample(struct thread_data *td, struct io_u *io_u,
-		   unsigned int bytes, unsigned long spent)
+		   unsigned int bytes, unsigned long long spent)
 {
 	struct thread_stat *ts = &td->ts;
 	unsigned long rate;
 
 	if (spent)
-		rate = bytes * 1000 / spent;
+		rate = (unsigned long) (bytes * 1000000ULL / spent);
 	else
 		rate = 0;
 
@@ -2409,64 +2471,76 @@
 	td_io_u_unlock(td);
 }
 
-static int add_bw_samples(struct thread_data *td, struct timeval *t)
+static int __add_samples(struct thread_data *td, struct timespec *parent_tv,
+			 struct timespec *t, unsigned int avg_time,
+			 uint64_t *this_io_bytes, uint64_t *stat_io_bytes,
+			 struct io_stat *stat, struct io_log *log,
+			 bool is_kb)
 {
-	struct thread_stat *ts = &td->ts;
 	unsigned long spent, rate;
 	enum fio_ddir ddir;
 	unsigned int next, next_log;
 
-	next_log = td->o.bw_avg_time;
+	next_log = avg_time;
 
-	spent = mtime_since(&td->bw_sample_time, t);
-	if (spent < td->o.bw_avg_time &&
-	    td->o.bw_avg_time - spent >= LOG_MSEC_SLACK)
-		return td->o.bw_avg_time - spent;
+	spent = mtime_since(parent_tv, t);
+	if (spent < avg_time && avg_time - spent >= LOG_MSEC_SLACK)
+		return avg_time - spent;
 
 	td_io_u_lock(td);
 
 	/*
 	 * Compute both read and write rates for the interval.
 	 */
-	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 		uint64_t delta;
 
-		delta = td->this_io_bytes[ddir] - td->stat_io_bytes[ddir];
+		delta = this_io_bytes[ddir] - stat_io_bytes[ddir];
 		if (!delta)
 			continue; /* No entries for interval */
 
-		if (spent)
-			rate = delta * 1000 / spent / 1024;
-		else
+		if (spent) {
+			if (is_kb)
+				rate = delta * 1000 / spent / 1024; /* KiB/s */
+			else
+				rate = (delta * 1000) / spent;
+		} else
 			rate = 0;
 
-		add_stat_sample(&ts->bw_stat[ddir], rate);
+		add_stat_sample(&stat[ddir], rate);
 
-		if (td->bw_log) {
+		if (log) {
 			unsigned int bs = 0;
 
 			if (td->o.min_bs[ddir] == td->o.max_bs[ddir])
 				bs = td->o.min_bs[ddir];
 
-			next = add_log_sample(td, td->bw_log, sample_val(rate),
-					      ddir, bs, 0);
+			next = add_log_sample(td, log, sample_val(rate), ddir, bs, 0);
 			next_log = min(next_log, next);
 		}
 
-		td->stat_io_bytes[ddir] = td->this_io_bytes[ddir];
+		stat_io_bytes[ddir] = this_io_bytes[ddir];
 	}
 
-	timeval_add_msec(&td->bw_sample_time, td->o.bw_avg_time);
+	timespec_add_msec(parent_tv, avg_time);
 
 	td_io_u_unlock(td);
 
-	if (spent <= td->o.bw_avg_time)
-		return min(next_log, td->o.bw_avg_time);
+	if (spent <= avg_time)
+		next = avg_time;
+	else
+		next = avg_time - (1 + spent - avg_time);
 
-	next = td->o.bw_avg_time - (1 + spent - td->o.bw_avg_time);
 	return min(next, next_log);
 }
 
+static int add_bw_samples(struct thread_data *td, struct timespec *t)
+{
+	return __add_samples(td, &td->bw_sample_time, t, td->o.bw_avg_time,
+				td->this_io_bytes, td->stat_io_bytes,
+				td->ts.bw_stat, td->bw_log, true);
+}
+
 void add_iops_sample(struct thread_data *td, struct io_u *io_u,
 		     unsigned int bytes)
 {
@@ -2484,62 +2558,11 @@
 	td_io_u_unlock(td);
 }
 
-static int add_iops_samples(struct thread_data *td, struct timeval *t)
+static int add_iops_samples(struct thread_data *td, struct timespec *t)
 {
-	struct thread_stat *ts = &td->ts;
-	unsigned long spent, iops;
-	enum fio_ddir ddir;
-	unsigned int next, next_log;
-
-	next_log = td->o.iops_avg_time;
-
-	spent = mtime_since(&td->iops_sample_time, t);
-	if (spent < td->o.iops_avg_time &&
-	    td->o.iops_avg_time - spent >= LOG_MSEC_SLACK)
-		return td->o.iops_avg_time - spent;
-
-	td_io_u_lock(td);
-
-	/*
-	 * Compute both read and write rates for the interval.
-	 */
-	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
-		uint64_t delta;
-
-		delta = td->this_io_blocks[ddir] - td->stat_io_blocks[ddir];
-		if (!delta)
-			continue; /* No entries for interval */
-
-		if (spent)
-			iops = (delta * 1000) / spent;
-		else
-			iops = 0;
-
-		add_stat_sample(&ts->iops_stat[ddir], iops);
-
-		if (td->iops_log) {
-			unsigned int bs = 0;
-
-			if (td->o.min_bs[ddir] == td->o.max_bs[ddir])
-				bs = td->o.min_bs[ddir];
-
-			next = add_log_sample(td, td->iops_log,
-					      sample_val(iops), ddir, bs, 0);
-			next_log = min(next_log, next);
-		}
-
-		td->stat_io_blocks[ddir] = td->this_io_blocks[ddir];
-	}
-
-	timeval_add_msec(&td->iops_sample_time, td->o.iops_avg_time);
-
-	td_io_u_unlock(td);
-
-	if (spent <= td->o.iops_avg_time)
-		return min(next_log, td->o.iops_avg_time);
-
-	next = td->o.iops_avg_time - (1 + spent - td->o.iops_avg_time);
-	return min(next, next_log);
+	return __add_samples(td, &td->iops_sample_time, t, td->o.iops_avg_time,
+				td->this_io_blocks, td->stat_io_blocks,
+				td->ts.iops_stat, td->iops_log, false);
 }
 
 /*
@@ -2549,23 +2572,27 @@
 {
 	struct thread_data *td;
 	unsigned int next = ~0U, tmp;
-	struct timeval now;
+	struct timespec now;
 	int i;
 
 	fio_gettime(&now, NULL);
 
 	for_each_td(td, i) {
+		if (!td->o.stats)
+			continue;
 		if (in_ramp_time(td) ||
 		    !(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING)) {
 			next = min(td->o.iops_avg_time, td->o.bw_avg_time);
 			continue;
 		}
-		if (td->bw_log && !per_unit_log(td->bw_log)) {
+		if (!td->bw_log ||
+			(td->bw_log && !per_unit_log(td->bw_log))) {
 			tmp = add_bw_samples(td, &now);
 			if (tmp < next)
 				next = tmp;
 		}
-		if (td->iops_log && !per_unit_log(td->iops_log)) {
+		if (!td->iops_log ||
+			(td->iops_log && !per_unit_log(td->iops_log))) {
 			tmp = add_iops_samples(td, &now);
 			if (tmp < next)
 				next = tmp;
diff -Nru fio-2.16/stat.h fio-3.1/stat.h
--- fio-2.16/stat.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/stat.h	2017-09-28 10:23:20.000000000 +0000
@@ -7,7 +7,7 @@
 struct group_run_stats {
 	uint64_t max_run[DDIR_RWDIR_CNT], min_run[DDIR_RWDIR_CNT];
 	uint64_t max_bw[DDIR_RWDIR_CNT], min_bw[DDIR_RWDIR_CNT];
-	uint64_t io_kb[DDIR_RWDIR_CNT];
+	uint64_t iobytes[DDIR_RWDIR_CNT];
 	uint64_t agg[DDIR_RWDIR_CNT];
 	uint32_t kb_base;
 	uint32_t unit_base;
@@ -19,6 +19,7 @@
  * How many depth levels to log
  */
 #define FIO_IO_U_MAP_NR	7
+#define FIO_IO_U_LAT_N_NR 10
 #define FIO_IO_U_LAT_U_NR 10
 #define FIO_IO_U_LAT_M_NR 12
 
@@ -108,7 +109,7 @@
 
 #define FIO_IO_U_PLAT_BITS 6
 #define FIO_IO_U_PLAT_VAL (1 << FIO_IO_U_PLAT_BITS)
-#define FIO_IO_U_PLAT_GROUP_NR 19
+#define FIO_IO_U_PLAT_GROUP_NR 29
 #define FIO_IO_U_PLAT_NR (FIO_IO_U_PLAT_GROUP_NR * FIO_IO_U_PLAT_VAL)
 #define FIO_IO_U_LIST_MAX_LEN 20 /* The size of the default and user-specified
 					list of percentiles */
@@ -171,13 +172,15 @@
 	/*
 	 * IO depth and latency stats
 	 */
-	uint64_t clat_percentiles;
+	uint32_t clat_percentiles;
+	uint32_t lat_percentiles;
 	uint64_t percentile_precision;
 	fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
 
 	uint32_t io_u_map[FIO_IO_U_MAP_NR];
 	uint32_t io_u_submit[FIO_IO_U_MAP_NR];
 	uint32_t io_u_complete[FIO_IO_U_MAP_NR];
+	uint32_t io_u_lat_n[FIO_IO_U_LAT_N_NR];
 	uint32_t io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	uint32_t io_u_lat_m[FIO_IO_U_LAT_M_NR];
 	uint32_t io_u_plat[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
@@ -242,17 +245,17 @@
 	uint32_t nr_pending;
 	uint32_t nr_setting_up;
 
-	uint32_t files_open;
-
 	uint64_t m_rate[DDIR_RWDIR_CNT], t_rate[DDIR_RWDIR_CNT];
-	uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT];
 	uint64_t rate[DDIR_RWDIR_CNT];
+	uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT];
 	uint32_t iops[DDIR_RWDIR_CNT];
 	uint64_t elapsed_sec;
 	uint64_t eta_sec;
 	uint32_t is_pow2;
 	uint32_t unit_base;
 
+	uint32_t files_open;
+
 	/*
 	 * Network 'copy' of run_str[]
 	 */
@@ -286,8 +289,9 @@
 extern void init_thread_stat(struct thread_stat *ts);
 extern void init_group_run_stat(struct group_run_stats *gs);
 extern void eta_to_str(char *str, unsigned long eta_sec);
-extern bool calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max, double *mean, double *dev);
-extern unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned int **output, unsigned int *maxv, unsigned int *minv);
+extern bool calc_lat(struct io_stat *is, unsigned long long *min, unsigned long long *max, double *mean, double *dev);
+extern unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned long long **output, unsigned long long *maxv, unsigned long long *minv);
+extern void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat);
 extern void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat);
 extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat);
 extern void stat_calc_dist(unsigned int *map, unsigned long total, double *io_u_dist);
@@ -295,9 +299,9 @@
 extern void update_rusage_stat(struct thread_data *);
 extern void clear_rusage_stat(struct thread_data *);
 
-extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long,
+extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
 				unsigned int, uint64_t);
-extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long,
+extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
 				unsigned int, uint64_t);
 extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long,
 				unsigned int, uint64_t);
@@ -305,16 +309,17 @@
 extern void add_iops_sample(struct thread_data *, struct io_u *,
 				unsigned int);
 extern void add_bw_sample(struct thread_data *, struct io_u *,
-				unsigned int, unsigned long);
+				unsigned int, unsigned long long);
 extern int calc_log_samples(void);
 
 extern struct io_log *agg_io_log[DDIR_RWDIR_CNT];
 extern int write_bw_log;
 
-static inline bool usec_to_msec(unsigned long *min, unsigned long *max,
-				double *mean, double *dev)
+static inline bool nsec_to_usec(unsigned long long *min,
+				unsigned long long *max, double *mean,
+				double *dev)
 {
-	if (*min > 1000 && *max > 1000 && *mean > 1000.0 && *dev > 1000.0) {
+	if (*min > 2000 && *max > 99999 && *dev > 1000.0) {
 		*min /= 1000;
 		*max /= 1000;
 		*mean /= 1000.0;
@@ -324,6 +329,22 @@
 
 	return false;
 }
+
+static inline bool nsec_to_msec(unsigned long long *min,
+				unsigned long long *max, double *mean,
+				double *dev)
+{
+	if (*min > 2000000 && *max > 99999999ULL && *dev > 1000000.0) {	/* mean is not tested */
+		*min /= 1000000;
+		*max /= 1000000;
+		*mean /= 1000000.0;
+		*dev /= 1000000.0;
+		return true;	/* all four values were divided by 1e6 */
+	}
+
+	return false;	/* values left unchanged */
+}
+
 /*
  * Worst level condensing would be 1:5, so allow enough room for that
  */
diff -Nru fio-2.16/steadystate.c fio-3.1/steadystate.c
--- fio-2.16/steadystate.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/steadystate.c	2017-09-28 10:23:20.000000000 +0000
@@ -8,13 +8,8 @@
 
 static void steadystate_alloc(struct thread_data *td)
 {
-	int i;
-
-	td->ss.bw_data = malloc(td->ss.dur * sizeof(uint64_t));
-	td->ss.iops_data = malloc(td->ss.dur * sizeof(uint64_t));
-	/* initialize so that it is obvious if the cache is not full in the output */
-	for (i = 0; i < td->ss.dur; i++)
-		td->ss.iops_data[i] = td->ss.bw_data[i] = 0;
+	td->ss.bw_data = calloc(td->ss.dur, sizeof(uint64_t));
+	td->ss.iops_data = calloc(td->ss.dur, sizeof(uint64_t));
 
 	td->ss.state |= __FIO_SS_DATA;
 }
@@ -201,7 +196,7 @@
 	int i, j, ddir, prev_groupid, group_ramp_time_over = 0;
 	unsigned long rate_time;
 	struct thread_data *td, *td2;
-	struct timeval now;
+	struct timespec now;
 	uint64_t group_bw = 0, group_iops = 0;
 	uint64_t td_iops, td_bytes;
 	bool ret;
@@ -236,7 +231,7 @@
 		}
 
 		td_io_u_lock(td);
-		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 			td_iops += td->io_blocks[ddir];
 			td_bytes += td->io_bytes[ddir];
 		}
diff -Nru fio-2.16/steadystate.h fio-3.1/steadystate.h
--- fio-2.16/steadystate.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/steadystate.h	2017-09-28 10:23:20.000000000 +0000
@@ -35,7 +35,7 @@
 	uint64_t sum_xy;
 	uint64_t oldest_y;
 
-	struct timeval prev_time;
+	struct timespec prev_time;
 	uint64_t prev_iops;
 	uint64_t prev_bytes;
 };
diff -Nru fio-2.16/t/arch.c fio-3.1/t/arch.c
--- fio-2.16/t/arch.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/arch.c	2017-09-28 10:23:20.000000000 +0000
@@ -1,5 +1,5 @@
 #include "../arch/arch.h"
 
 unsigned long arch_flags = 0;
-int tsc_reliable;
+bool tsc_reliable;
 int arch_random;
diff -Nru fio-2.16/t/axmap.c fio-3.1/t/axmap.c
--- fio-2.16/t/axmap.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/axmap.c	2017-09-28 10:23:20.000000000 +0000
@@ -8,16 +8,6 @@
 #include "../lib/lfsr.h"
 #include "../lib/axmap.h"
 
-void *smalloc(size_t size)
-{
-	return malloc(size);
-}
-
-void sfree(void *ptr)
-{
-	free(ptr);
-}
-
 static int test_regular(size_t size, int seed)
 {
 	struct fio_lfsr lfsr;
diff -Nru fio-2.16/t/btrace2fio.c fio-3.1/t/btrace2fio.c
--- fio-2.16/t/btrace2fio.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/btrace2fio.c	2017-09-28 10:23:20.000000000 +0000
@@ -62,7 +62,7 @@
 
 	uint64_t first_ttime[DDIR_RWDIR_CNT];
 	uint64_t last_ttime[DDIR_RWDIR_CNT];
-	uint64_t kb[DDIR_RWDIR_CNT];
+	uint64_t kib[DDIR_RWDIR_CNT];
 
 	uint64_t start_delay;
 };
@@ -406,7 +406,7 @@
 
 		i = inflight_find(t->sector + (t->bytes >> 9));
 		if (i) {
-			i->p->o.kb[t_to_rwdir(t)] += (t->bytes >> 10);
+			i->p->o.kib[t_to_rwdir(t)] += (t->bytes >> 10);
 			i->p->o.complete_seen = 1;
 			inflight_remove(i);
 		}
@@ -556,7 +556,7 @@
 	return bsb->nr - bsa->nr;
 }
 
-static unsigned long o_to_kb_rate(struct btrace_out *o, int rw)
+static unsigned long o_to_kib_rate(struct btrace_out *o, int rw)
 {
 	uint64_t usec = (o->last_ttime[rw] - o->first_ttime[rw]) / 1000ULL;
 	uint64_t val;
@@ -568,7 +568,7 @@
 	if (!usec)
 		return 0;
 
-	val = o->kb[rw] * 1000ULL;
+	val = o->kib[rw] * 1000ULL;
 	return val / usec;
 }
 
@@ -623,7 +623,7 @@
 		printf("\tmerges: %lu (perc=%3.2f%%)\n", o->merges[i], perc);
 		perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i];
 		printf("\tseq:    %lu (perc=%3.2f%%)\n", (unsigned long) o->seq[i], perc);
-		printf("\trate:   %lu KB/sec\n", o_to_kb_rate(o, i));
+		printf("\trate:   %lu KiB/sec\n", o_to_kib_rate(o, i));
 
 		for (j = 0; j < o->nr_bs[i]; j++) {
 			struct bs *bs = &o->bs[i][j];
@@ -746,7 +746,7 @@
 		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 			unsigned long rate;
 
-			rate = o_to_kb_rate(o, i);
+			rate = o_to_kib_rate(o, i);
 			if (i)
 				printf(",");
 			if (rate)
@@ -810,7 +810,7 @@
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		unsigned long this_rate;
 
-		this_rate = o_to_kb_rate(o, i);
+		this_rate = o_to_kib_rate(o, i);
 		if (this_rate < rate_threshold) {
 			remove_ddir(o, i);
 			this_rate = 0;
@@ -926,7 +926,7 @@
 		oa->ios[i] += ob->ios[i];
 		oa->merges[i] += ob->merges[i];
 		oa->seq[i] += ob->seq[i];
-		oa->kb[i] += ob->kb[i];
+		oa->kib[i] += ob->kib[i];
 		oa->first_ttime[i] = min(oa->first_ttime[i], ob->first_ttime[i]);
 		oa->last_ttime[i] = max(oa->last_ttime[i], ob->last_ttime[i]);
 		merge_bs(&oa->bs[i], &oa->nr_bs[i], ob->bs[i], ob->nr_bs[i]);
@@ -1021,7 +1021,7 @@
 	log_err("\t-n\tNumber IOS threshold to ignore task\n");
 	log_err("\t-f\tFio job file output\n");
 	log_err("\t-d\tUse this file/device for replay\n");
-	log_err("\t-r\tIgnore jobs with less than this KB/sec rate\n");
+	log_err("\t-r\tIgnore jobs with less than this KiB/sec rate\n");
 	log_err("\t-R\tSet rate in fio job (def=%u)\n", set_rate);
 	log_err("\t-D\tCap queue depth at this value (def=%u)\n", max_depth);
 	log_err("\t-c\tCollapse \"identical\" jobs (def=%u)\n", collapse_entries);
diff -Nru fio-2.16/t/debug.c fio-3.1/t/debug.c
--- fio-2.16/t/debug.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/debug.c	2017-09-28 10:23:20.000000000 +0000
@@ -1,7 +1,7 @@
 #include <stdio.h>
 
 FILE *f_err;
-struct timeval *fio_tv = NULL;
+struct timespec *fio_ts = NULL;
 unsigned long fio_debug = 0;
 
 void __dprint(int type, const char *str, ...)
diff -Nru fio-2.16/t/dedupe.c fio-3.1/t/dedupe.c
--- fio-2.16/t/dedupe.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/dedupe.c	2017-09-28 10:23:20.000000000 +0000
@@ -14,7 +14,6 @@
 #include <fcntl.h>
 #include <string.h>
 
-#include "../lib/rbtree.h"
 #include "../flist.h"
 #include "../log.h"
 #include "../mutex.h"
@@ -25,6 +24,7 @@
 #include "../os/os.h"
 #include "../gettime.h"
 #include "../fio_time.h"
+#include "../lib/rbtree.h"
 
 #include "../lib/bloom.h"
 #include "debug.h"
@@ -334,7 +334,7 @@
 static void show_progress(struct worker_thread *threads, unsigned long total)
 {
 	unsigned long last_nitems = 0;
-	struct timeval last_tv;
+	struct timespec last_tv;
 
 	fio_gettime(&last_tv, NULL);
 
@@ -363,7 +363,7 @@
 		tdiff = mtime_since_now(&last_tv);
 		if (tdiff) {
 			this_items = (this_items * 1000) / (tdiff * 1024);
-			printf("%3.2f%% done (%luKB/sec)\r", perc, this_items);
+			printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items);
 			last_nitems = nitems;
 			fio_gettime(&last_tv, NULL);
 		} else
diff -Nru fio-2.16/t/genzipf.c fio-3.1/t/genzipf.c
--- fio-2.16/t/genzipf.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/genzipf.c	2017-09-28 10:23:20.000000000 +0000
@@ -3,8 +3,8 @@
  * what an access pattern would look like.
  *
  * For instance, the following would generate a zipf distribution
- * with theta 1.2, using 262144 (1 GB / 4096) values and split the reporting into
- * 20 buckets:
+ * with theta 1.2, using 262144 (1 GiB / 4096) values and split the
+ * reporting into 20 buckets:
  *
  *	./t/fio-genzipf -t zipf -i 1.2 -g 1 -b 4096 -o 20
  *
@@ -49,7 +49,7 @@
 };
 
 static int dist_type = TYPE_ZIPF;
-static unsigned long gb_size = 500;
+static unsigned long gib_size = 500;
 static unsigned long block_size = 4096;
 static unsigned long output_nranges = DEF_NR_OUTPUT;
 static double percentage;
@@ -131,7 +131,7 @@
 			}
 			break;
 		case 'g':
-			gb_size = strtoul(optarg, NULL, 10);
+			gib_size = strtoul(optarg, NULL, 10);
 			break;
 		case 'i':
 			dist_val = atof(optarg);
@@ -291,9 +291,10 @@
 		return 1;
 
 	if (output_type != OUTPUT_CSV)
-		printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size);
+		printf("Generating %s distribution with %f input and %lu GiB size and %lu block_size.\n",
+		       dist_types[dist_type], dist_val, gib_size, block_size);
 
-	nranges = gb_size * 1024 * 1024 * 1024ULL;
+	nranges = gib_size * 1024 * 1024 * 1024ULL;
 	nranges /= block_size;
 
 	if (dist_type == TYPE_ZIPF)
diff -Nru fio-2.16/t/lfsr-test.c fio-3.1/t/lfsr-test.c
--- fio-2.16/t/lfsr-test.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/lfsr-test.c	2017-09-28 10:23:20.000000000 +0000
@@ -27,7 +27,7 @@
 int main(int argc, char *argv[])
 {
 	int r;
-	struct timeval start, end;
+	struct timespec start, end;
 	struct fio_lfsr *fl;
 	int verify = 0;
 	unsigned int spin = 0;
@@ -80,7 +80,7 @@
 		v_size = numbers * sizeof(uint8_t);
 		v = malloc(v_size);
 		memset(v, 0, v_size);
-		printf("\nVerification table is %lf KBs\n", (double)(v_size) / 1024);
+		printf("\nVerification table is %lf KiB\n", (double)(v_size) / 1024);
 	}
 	v_start = v;
 
diff -Nru fio-2.16/t/log.c fio-3.1/t/log.c
--- fio-2.16/t/log.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/log.c	2017-09-28 10:23:20.000000000 +0000
@@ -2,7 +2,7 @@
 #include <stdarg.h>
 #include "../minmax.h"
 
-int log_err(const char *format, ...)
+size_t log_err(const char *format, ...)
 {
 	char buffer[1024];
 	va_list args;
@@ -16,7 +16,7 @@
 	return fwrite(buffer, len, 1, stderr);
 }
 
-int log_info(const char *format, ...)
+size_t log_info(const char *format, ...)
 {
 	char buffer[1024];
 	va_list args;
diff -Nru fio-2.16/t/memlock.c fio-3.1/t/memlock.c
--- fio-2.16/t/memlock.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/memlock.c	2017-09-28 10:23:20.000000000 +0000
@@ -4,7 +4,7 @@
 #include <pthread.h>
 
 static struct thread_data {
-	unsigned long mb;
+	unsigned long mib;
 } td;
 
 static void *worker(void *data)
@@ -15,14 +15,14 @@
 	char *buf;
 	int i, first = 1;
 
-	size = td->mb * 1024UL * 1024UL;
+	size = td->mib * 1024UL * 1024UL;
 	buf = malloc(size);
 
 	for (i = 0; i < 100000; i++) {
 		for (index = 0; index + 4096 < size; index += 4096)
 			memset(&buf[index+512], 0x89, 512);
 		if (first) {
-			printf("loop%d: did %lu MB\n", i+1, size/(1024UL*1024UL));
+			printf("loop%d: did %lu MiB\n", i+1, size/(1024UL*1024UL));
 			first = 0;
 		}
 	}
@@ -31,20 +31,20 @@
 
 int main(int argc, char *argv[])
 {
-	unsigned long mb, threads;
+	unsigned long mib, threads;
 	pthread_t *pthreads;
 	int i;
 
 	if (argc < 3) {
-		printf("%s: <mb per thread> <threads>\n", argv[0]);
+		printf("%s: <MiB per thread> <threads>\n", argv[0]);
 		return 1;
 	}
 
-	mb = strtoul(argv[1], NULL, 10);
+	mib = strtoul(argv[1], NULL, 10);
 	threads = strtoul(argv[2], NULL, 10);
 
 	pthreads = calloc(threads, sizeof(pthread_t));
-	td.mb = mb;
+	td.mib = mib;
 
 	for (i = 0; i < threads; i++)
 		pthread_create(&pthreads[i], NULL, worker, &td);
diff -Nru fio-2.16/t/read-to-pipe-async.c fio-3.1/t/read-to-pipe-async.c
--- fio-2.16/t/read-to-pipe-async.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/read-to-pipe-async.c	2017-09-28 10:23:20.000000000 +0000
@@ -661,9 +661,9 @@
 
 	bytes /= 1024;
 	rate = (bytes * 1000UL * 1000UL) / utime_since(&s, &re);
-	fprintf(stderr, "Read rate (KB/sec) : %lu\n", rate);
+	fprintf(stderr, "Read rate (KiB/sec) : %lu\n", rate);
 	rate = (bytes * 1000UL * 1000UL) / utime_since(&s, &we);
-	fprintf(stderr, "Write rate (KB/sec): %lu\n", rate);
+	fprintf(stderr, "Write rate (KiB/sec): %lu\n", rate);
 
 	close(fd);
 	return 0;
diff -Nru fio-2.16/t/stest.c fio-3.1/t/stest.c
--- fio-2.16/t/stest.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/stest.c	2017-09-28 10:23:20.000000000 +0000
@@ -59,15 +59,6 @@
 	return 0;
 }
 
-static int do_specific_alloc(unsigned long size)
-{
-	void *ptr;
-
-	ptr = smalloc(size);
-	sfree(ptr);
-	return 0;
-}
-
 int main(int argc, char *argv[])
 {
 	arch_init(argv);
@@ -76,9 +67,6 @@
 
 	do_rand_allocs();
 
-	/* smalloc bug, commit 271067a6 */
-	do_specific_alloc(671386584);
-
 	scleanup();
 	return 0;
 }
diff -Nru fio-2.16/t/time-test.c fio-3.1/t/time-test.c
--- fio-2.16/t/time-test.c	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/t/time-test.c	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,544 @@
+/*
+ * Carry out arithmetic to explore conversion of CPU clock ticks to nsec
+ *
+ * When we use the CPU clock for timing, we do the following:
+ *
+ * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks
+ *    to actual time.
+ *
+ *    Using gettimeofday() or clock_gettime(), count how many CPU clock
+ *    ticks occur per usec
+ *
+ * 2) Calculate conversion factors so that we can ultimately convert
+ *    from clocks ticks to nsec with
+ *      nsec = (ticks * clock_mult) >> clock_shift
+ *
+ *    This is equivalent to
+ *	nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER
+ *    where
+ *	clock_mult = MULTIPLIER / cycles_per_nsec
+ *      MULTIPLIER = 2^clock_shift
+ *
+ *    It would be simpler to just calculate nsec = ticks / cycles_per_nsec,
+ *    but all of this is necessary because of rounding when calculating
+ *    cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply
+ *    be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional
+ *    portion is lost with integer arithmetic.
+ *
+ *    This multiply and shift calculation also has a performance benefit
+ *    as multiplication and bit shift operations are faster than integer
+ *    division.
+ *
+ * 3) Dynamically determine clock_shift and clock_mult at run time based
+ *    on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum
+ *    duration for which the conversion will be valid.
+ *
+ *    The primary constraint is that (ticks * clock_mult) must not overflow
+ *    when ticks is at its maximum value.
+ *
+ *    So we have
+ *	max_ticks = MAX_CLOCK_SEC * 1000000000 * cycles_per_nsec
+ *	max_ticks * clock_mult <= ULLONG_MAX
+ *	max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX
+ *      MULTIPLIER <= ULLONG_MAX * cycles_per_nsec / max_ticks
+ *
+ *    Then choose the largest clock_shift that satisfies
+ *	2^clock_shift <= ULLONG_MAX * cycles_per_nsec / max_ticks
+ *
+ *    Finally calculate the appropriate clock_mult associated with clock_shift
+ *	clock_mult = 2^clock_shift / cycles_per_nsec
+ *
+ * 4) In the code below we have cycles_per_usec and use
+ *	cycles_per_nsec = cycles_per_usec / 1000
+ *
+ *
+ * The code below implements 4 clock tick to nsec conversion strategies
+ *
+ *   i) 64-bit arithmetic for the (ticks * clock_mult) product with the
+ *	conversion valid for at most MAX_CLOCK_SEC
+ *
+ *  ii) NOT IMPLEMENTED Use 64-bit integers to emulate 128-bit multiplication
+ *	for the (ticks * clock_mult) product
+ *
+ * iii) 64-bit arithmetic with clock ticks to nsec conversion occurring in
+ *	two stages. The first stage counts the number of discrete, large chunks
+ *	of time that have elapsed. To this is added the time represented by
+ *	the remaining clock ticks. The advantage of this strategy is better
+ *	accuracy because the (ticks * clock_mult) product is only used for the
+ *	final fractional chunk, which allows a larger clock_shift to be chosen.
+ *
+ *  iv) 64-bit arithmetic with the clock ticks to nsec conversion occurring in
+ *	two stages. This is carried out using locks to update the number of
+ *	large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed.
+ *
+ *   v) 128-bit arithmetic used for the clock ticks to nsec conversion.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "lib/seqlock.h"
+
+#define DEBUG 0	/* set to 1 to enable the dprintf() traces */
+#define MAX_CLOCK_SEC (365*24*60*60ULL)	/* 365 days, in seconds */
+#define MAX_CLOCK_SEC_2STAGE (60*60ULL)	/* one hour, in seconds */
+#define dprintf(...) do { if (DEBUG) printf(__VA_ARGS__); } while (0)
+
+enum {
+	__CLOCK64_BIT		= 1 << 0,
+	__CLOCK128_BIT		= 1 << 1,
+	__CLOCK_MULT_SHIFT	= 1 << 2,
+	__CLOCK_EMULATE_128	= 1 << 3,
+	__CLOCK_2STAGE		= 1 << 4,
+	__CLOCK_LOCK		= 1 << 5,
+
+	CLOCK64_MULT_SHIFT	= __CLOCK64_BIT | __CLOCK_MULT_SHIFT,
+	CLOCK64_EMULATE_128	= __CLOCK64_BIT | __CLOCK_EMULATE_128,
+	CLOCK64_2STAGE		= __CLOCK64_BIT | __CLOCK_2STAGE,
+	CLOCK64_LOCK		= __CLOCK64_BIT | __CLOCK_LOCK,
+	CLOCK128_MULT_SHIFT	= __CLOCK128_BIT | __CLOCK_MULT_SHIFT,
+};
+
+static struct seqlock clock_seqlock;
+static unsigned long long cycles_start;
+static unsigned long long elapsed_nsec;
+
+static unsigned int max_cycles_shift;
+static unsigned long long max_cycles_mask;
+static unsigned long long nsecs_for_max_cycles;
+
+static unsigned int clock_shift;
+static unsigned long long clock_mult;
+
+static unsigned long long *nsecs;
+static unsigned long long clock_mult64_128[2];
+static __uint128_t clock_mult128;
+
+/*
+ * Functions for carrying out 128-bit
+ * arithmetic using 64-bit integers
+ *
+ * 128-bit integers are stored as
+ * arrays of two 64-bit integers
+ *
+ * Ordering is little endian
+ *
+ * a[0] has the less significant bits
+ * a[1] has the more significant bits
+ *
+ * NOT FULLY IMPLEMENTED
+ */
+static void do_mult(unsigned long long a[2], unsigned long long b,
+		    unsigned long long product[2])
+{
+	product[0] = product[1] = 0;	/* stub: a and b are ignored, result is always 0 */
+	return;
+}
+
+static void do_div(unsigned long long a[2], unsigned long long b,
+		   unsigned long long c[2])
+{
+	return;	/* stub: no division is performed and c is left untouched */
+}
+
+static void do_shift64(unsigned long long a[2], unsigned int count)
+{
+	a[0] = a[1] >> (count-64);	/* only valid for 64 <= count < 128 */
+	a[1] = 0;
+}
+
+static void do_shift(unsigned long long a[2], unsigned int count)
+{
+	if (count > 64)
+		do_shift64(a, count);
+	else {
+		while (count--) {	/* shift the 128-bit value right one bit per pass */
+			a[0] >>= 1;
+			a[0] |= a[1] << 63;	/* low bit of the high word becomes bit 63 of the low word */
+			a[1] >>= 1;
+		}
+	}
+}
+
+static void update_clock(unsigned long long t)
+{
+	write_seqlock_begin(&clock_seqlock);
+	elapsed_nsec = (t >> max_cycles_shift) * nsecs_for_max_cycles;	/* nsec for the whole 2^max_cycles_shift-tick periods in t */
+	cycles_start = t & ~max_cycles_mask;	/* t with the low max_cycles_mask bits cleared */
+	write_seqlock_end(&clock_seqlock);
+}
+
+static unsigned long long _get_nsec(int mode, unsigned long long t)
+{
+	switch(mode) {
+	case CLOCK64_MULT_SHIFT:	/* single 64-bit multiply and shift */
+		return (t * clock_mult) >> clock_shift;
+	case CLOCK64_EMULATE_128: {	/* goes through do_mult(), which is still a stub */
+		unsigned long long product[2] =  { };
+
+		do_mult(clock_mult64_128, t, product);
+		do_shift(product, clock_shift);
+		return product[0];
+		}
+	case CLOCK64_2STAGE: {	/* whole periods first, then the remaining ticks */
+		unsigned long long multiples, nsec;
+
+		multiples = t >> max_cycles_shift;
+		dprintf("multiples=%llu\n", multiples);
+		nsec = multiples * nsecs_for_max_cycles;
+		nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift;
+		return nsec;
+		}
+	case CLOCK64_LOCK: {	/* uses elapsed_nsec/cycles_start set by update_clock() */
+		unsigned int seq;
+		unsigned long long nsec;
+
+		do {
+			seq = read_seqlock_begin(&clock_seqlock);
+			nsec = elapsed_nsec;
+			nsec += ((t - cycles_start) * clock_mult) >> clock_shift;
+		} while (read_seqlock_retry(&clock_seqlock, seq));
+		return nsec;
+		}
+	case CLOCK128_MULT_SHIFT:	/* product computed in 128 bits, result truncated to 64 */
+		return (unsigned long long)((t * clock_mult128) >> clock_shift);
+	}
+	assert(0);	/* unknown mode */
+	return 0;	/* only reached when assert() is compiled out (NDEBUG) */
+}
+
+static unsigned long long get_nsec(int mode, unsigned long long t)
+{
+	if (mode == CLOCK64_LOCK) {
+		update_clock(t);	/* refresh elapsed_nsec and cycles_start for this tick count */
+	}
+
+	return _get_nsec(mode, t);
+}
+
+static void calc_mult_shift(int mode, void *mult, unsigned int *shift,
+			    unsigned long long max_sec,
+			    unsigned long long cycles_per_usec)
+{
+	unsigned long long max_ticks;
+	max_ticks = max_sec * cycles_per_usec * 1000000ULL;
+
+	switch (mode) {
+	case CLOCK64_MULT_SHIFT: {
+		unsigned long long max_mult, tmp;
+		unsigned int sft = 0;
+
+		/*
+		 * Calculate the largest multiplier that will not
+		 * produce a 64-bit overflow in the multiplication
+		 * step of the clock ticks to nsec conversion
+		 */
+		max_mult = ULLONG_MAX / max_ticks;
+		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult);
+
+		/*
+		 * Find the largest shift count that will produce
+		 * a multiplier less than max_mult
+		 */
+		tmp = max_mult * cycles_per_usec / 1000;
+		while (tmp > 1) {
+			tmp >>= 1;
+			sft++;
+			dprintf("tmp=%llu, sft=%u\n", tmp, sft);
+		}
+
+		*shift = sft;
+		*((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec);
+		break;
+		}
+	case CLOCK64_EMULATE_128: {
+		unsigned long long max_mult[2], tmp[2] = { };
+		unsigned int sft = 0;
+
+		/*
+		 * Calculate the largest multiplier that will not
+		 * produce a 128-bit overflow in the multiplication
+		 * step of the clock ticks to nsec conversion,
+		 * but use only 64-bit integers in the process
+		 */
+		max_mult[0] = max_mult[1] = ULLONG_MAX;
+		do_div(max_mult, max_ticks, max_mult);
+		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
+			max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]);
+
+		/*
+		 * Find the largest shift count that will produce
+		 * a multiplier less than max_mult
+		 */
+		do_div(max_mult, cycles_per_usec, tmp);
+		do_div(tmp, 1000ULL, tmp);
+		while (tmp[0] > 1 || tmp[1] > 1) {
+			do_shift(tmp, 1);
+			sft++;
+			dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft);
+		}
+
+		*shift = sft;
+//		*((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
+		break;
+		}
+	case CLOCK64_2STAGE: {
+		unsigned long long tmp;
+/*
+ * This clock tick to nsec conversion requires two stages.
+ *
+ * Stage 1: Determine how many ~MAX_CLOCK_SEC_2STAGE periods worth of clock ticks
+ * 	have elapsed and set nsecs to the appropriate value for those
+ *	~MAX_CLOCK_SEC_2STAGE periods.
+ * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC_2STAGE periods from
+ *	Stage 1. Convert remaining clock ticks to nsecs and add to previously
+ *	set nsec value.
+ *
+ * To optimize the arithmetic operations, use the greatest power of 2 ticks
+ * less than the number of ticks in MAX_CLOCK_SEC_2STAGE seconds.
+ *
+ */
+		// Use a period shorter than MAX_CLOCK_SEC here for better accuracy
+		calc_mult_shift(CLOCK64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC_2STAGE, cycles_per_usec);
+
+		// Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC_2STAGE
+		max_cycles_shift = max_cycles_mask = 0;
+		tmp = MAX_CLOCK_SEC_2STAGE * 1000000ULL * cycles_per_usec;
+		dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
+		while (tmp > 1) {
+			tmp >>= 1;
+			max_cycles_shift++;
+			dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
+		}
+		// If we use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will
+		// have a discontinuity every (1ULL << max_cycles_shift) cycles
+		nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift;
+
+		// Use a bitmask to calculate ticks % (1ULL << max_cycles_shift)
+		for (tmp = 0; tmp < max_cycles_shift; tmp++)
+			max_cycles_mask |= 1ULL << tmp;
+
+		dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n",
+				max_cycles_shift, (1ULL << max_cycles_shift),
+				nsecs_for_max_cycles, max_cycles_mask);
+
+
+		break;
+		}
+	case CLOCK64_LOCK: {
+/*
+ * This clock tick to nsec conversion also requires two stages.
+ *
+ * Stage 1: Add to nsec the current running total of elapsed long periods
+ * Stage 2: Subtract from clock ticks the tick count corresponding to the
+ *	most recently elapsed long period. Convert the remaining ticks to
+ *	nsec and add to the previous nsec value.
+ *
+ * In practice the elapsed nsec from Stage 1 and the tick count subtracted
+ * in Stage 2 will be maintained in a separate thread.
+ *
+ */
+		calc_mult_shift(CLOCK64_2STAGE, mult, shift, MAX_CLOCK_SEC, cycles_per_usec);
+		cycles_start = 0;
+		break;
+		}
+	case CLOCK128_MULT_SHIFT: {
+		__uint128_t max_mult, tmp;
+		unsigned int sft = 0;
+
+		/*
+		 * Calculate the largest multiplier that will not
+		 * produce a 128-bit overflow in the multiplication
+		 * step of the clock ticks to nsec conversion
+		 */
+		max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX;
+		max_mult /= max_ticks;
+		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
+				max_ticks, __builtin_clzll(max_ticks),
+				(unsigned long long) (max_mult >> 64),
+				(unsigned long long) max_mult);
+
+		/*
+		 * Find the largest shift count that will produce
+		 * a multiplier less than max_mult
+		 */
+		tmp = max_mult * cycles_per_usec / 1000;
+		while (tmp > 1) {
+			tmp >>= 1;
+			sft++;
+			dprintf("tmp=0x%016llx%016llx, sft=%u\n",
+					(unsigned long long) (tmp >> 64),
+					(unsigned long long) tmp, sft);
+		}
+
+		*shift = sft;
+		*((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
+		break;
+		}
+	}
+}
+
+static int discontinuity(int mode, int delta_ticks, int delta_nsec,
+			 unsigned long long start, unsigned long len)
+{
+	int i;
+	unsigned long mismatches = 0, bad_mismatches = 0;
+	unsigned long long delta, max_mismatch = 0;
+	unsigned long long *ns = nsecs;
+
+	for (i = 0; i < len; ns++, i++) {
+		*ns = get_nsec(mode, start + i);
+		if (i - delta_ticks >= 0) {
+			if (*ns > *(ns - delta_ticks))
+				delta = *ns - *(ns - delta_ticks);
+			else
+				delta = *(ns - delta_ticks) - *ns;
+			if (delta > delta_nsec)
+				delta -= delta_nsec;
+			else
+				delta = delta_nsec - delta;
+			if (delta) {
+				mismatches++;
+				if (delta > 1)
+					bad_mismatches++;
+				if (delta > max_mismatch)
+					max_mismatch = delta;
+			}
+		}
+		if (!bad_mismatches)
+			assert(max_mismatch == 0 || max_mismatch == 1);
+		if (!mismatches)
+			assert(max_mismatch == 0);
+	}
+
+	printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n",
+		mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start,
+		start + len - 1);
+	return mismatches;
+}
+
+#define MIN_TICKS 1ULL
+#define LEN 1000000000ULL
+#define NSEC_ONE_SEC 1000000000ULL
+#define TESTLEN 9
+
+static long long test_clock(int mode, int cycles_per_usec, int fast_test,
+			    int quiet, int delta_ticks, int delta_nsec)
+{
+	int i;
+	long long delta;
+	unsigned long long max_ticks;
+	unsigned long long nsecs;
+	void *mult;
+	unsigned long long test_ns[TESTLEN] =
+			{NSEC_ONE_SEC, NSEC_ONE_SEC,
+			 NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60,
+			 NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4,
+			 NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24};
+	unsigned long long test_ticks[TESTLEN];
+
+	max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL;
+
+	switch(mode) {
+	case CLOCK64_MULT_SHIFT:
+		mult = &clock_mult;
+		break;
+	case CLOCK64_EMULATE_128:
+		mult = clock_mult64_128;
+		break;
+	case CLOCK64_2STAGE:
+		mult = &clock_mult;
+		break;
+	case CLOCK64_LOCK:
+		mult = &clock_mult;
+		break;
+	case CLOCK128_MULT_SHIFT:
+		mult = &clock_mult128;
+		break;
+	default:
+		assert(0);
+	}
+	calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec);
+	nsecs = get_nsec(mode, max_ticks);
+	delta = nsecs/1000000 - MAX_CLOCK_SEC*1000;
+
+	if (mode == CLOCK64_2STAGE) {
+		test_ns[0] = nsecs_for_max_cycles - 1;
+		test_ns[1] = nsecs_for_max_cycles;
+		test_ticks[0] = (1ULL << max_cycles_shift) - 1;
+		test_ticks[1] = (1ULL << max_cycles_shift);
+
+		for (i = 2; i < TESTLEN; i++)
+			test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
+	}
+	else {
+		for (i = 0; i < TESTLEN; i++)
+			test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
+	}
+
+	if (!quiet) {
+		printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n",
+			cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift));
+		switch(mode) {
+			case CLOCK64_LOCK:
+			case CLOCK64_2STAGE:
+			case CLOCK64_MULT_SHIFT: {
+				printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n",
+					clock_mult, (double) clock_mult / (1ULL << clock_shift));
+				break;
+			}
+			case CLOCK64_EMULATE_128: {
+				printf("clock_mult=0x%016llx%016llx\n",
+					clock_mult64_128[1], clock_mult64_128[0]);
+				break;
+			}
+			case CLOCK128_MULT_SHIFT: {
+				printf("clock_mult=0x%016llx%016llx\n",
+					(unsigned long long) (clock_mult128 >> 64),
+					(unsigned long long) clock_mult128);
+				break;
+			}
+		}
+		printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n",
+			nsecs, MAX_CLOCK_SEC*1000000000ULL, delta);
+	}
+
+	for (i = 0; i < TESTLEN; i++)
+	{
+		nsecs = get_nsec(mode, test_ticks[i]);
+		delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs;
+		if (!quiet || delta > 0)
+			printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n",
+				test_ticks[i], nsecs, test_ns[i], delta);
+	}
+
+	if (!fast_test) {
+		discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN);
+		discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN);
+	}
+
+	if (!quiet)
+		printf("\n\n");
+
+	return delta;
+}
+
+int main(int argc, char *argv[])
+{
+	nsecs = malloc(LEN * sizeof(unsigned long long));
+
+	test_clock(CLOCK64_LOCK, 3333, 1, 0, 0, 0);
+	test_clock(CLOCK64_LOCK, 1000, 1, 0, 1, 1);
+	test_clock(CLOCK64_LOCK, 1100, 1, 0, 11, 10);
+	test_clock(CLOCK64_LOCK, 3000, 1, 0, 3, 1);
+	test_clock(CLOCK64_LOCK, 3333, 1, 0, 3333, 1000);
+	test_clock(CLOCK64_LOCK, 3392, 1, 0, 424, 125);
+	test_clock(CLOCK64_LOCK, 4500, 1, 0, 9, 2);
+	test_clock(CLOCK64_LOCK, 5000, 1, 0, 5, 1);
+
+	free(nsecs);
+	return 0;
+}
diff -Nru fio-2.16/t/verify-state.c fio-3.1/t/verify-state.c
--- fio-2.16/t/verify-state.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/t/verify-state.c	2017-09-28 10:23:20.000000000 +0000
@@ -58,7 +58,8 @@
 		show_s(s, no_s);
 		no_s++;
 		size -= __thread_io_list_sz(s->depth, s->nofiles);
-		s = (void *) s + __thread_io_list_sz(s->depth, s->nofiles);
+		s = (struct thread_io_list *)((char *) s +
+			__thread_io_list_sz(s->depth, s->nofiles));
 	} while (size != 0);
 }
 
diff -Nru fio-2.16/td_error.c fio-3.1/td_error.c
--- fio-2.16/td_error.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/td_error.c	2017-09-28 10:23:20.000000000 +0000
@@ -20,8 +20,7 @@
 
 	if (!td->o.ignore_error[etype]) {
 		td->o.ignore_error[etype] = __NON_FATAL_ERR;
-		td->o.ignore_error_nr[etype] = sizeof(__NON_FATAL_ERR)
-			/ sizeof(int);
+		td->o.ignore_error_nr[etype] = ARRAY_SIZE(__NON_FATAL_ERR);
 	}
 
 	if (!(td->o.continue_on_error & (1 << etype)))
diff -Nru fio-2.16/td_error.h fio-3.1/td_error.h
--- fio-2.16/td_error.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/td_error.h	2017-09-28 10:23:20.000000000 +0000
@@ -2,7 +2,8 @@
 #define FIO_TD_ERROR_H
 
 /*
- * What type of errors to continue on when continue_on_error is used
+ * What type of errors to continue on when continue_on_error is used,
+ * and what type of errors to ignore when ignore_error is used.
  */
 enum error_type_bit {
 	ERROR_TYPE_READ_BIT = 0,
diff -Nru fio-2.16/thread_options.h fio-3.1/thread_options.h
--- fio-2.16/thread_options.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/thread_options.h	2017-09-28 10:23:20.000000000 +0000
@@ -20,6 +20,7 @@
 	MEM_MMAP,	/* use anonynomous mmap */
 	MEM_MMAPHUGE,	/* memory mapped huge file */
 	MEM_MMAPSHARED, /* use mmap with shared flag */
+	MEM_CUDA_MALLOC,/* use GPU memory */
 };
 
 #define ERROR_STR_MAX	128
@@ -52,6 +53,7 @@
 	char *filename_format;
 	char *opendir;
 	char *ioengine;
+	char *ioengine_so_path;
 	char *mmapfile;
 	enum td_ddir td_ddir;
 	unsigned int rw_seq;
@@ -64,11 +66,12 @@
 	unsigned int iodepth_batch;
 	unsigned int iodepth_batch_complete_min;
 	unsigned int iodepth_batch_complete_max;
+	unsigned int serialize_overlap;
 
 	unsigned int unique_filename;
 
 	unsigned long long size;
-	unsigned long long io_limit;
+	unsigned long long io_size;
 	unsigned int size_percent;
 	unsigned int fill_device;
 	unsigned int file_append;
@@ -101,6 +104,7 @@
 	unsigned int end_fsync;
 	unsigned int pre_read;
 	unsigned int sync_io;
+	unsigned int write_hint;
 	unsigned int verify;
 	unsigned int do_verify;
 	unsigned int verifysort;
@@ -198,6 +202,9 @@
 	unsigned short numa_mem_mode;
 	unsigned int numa_mem_prefer_node;
 	char *numa_memnodes;
+	unsigned int gpu_dev_id;
+	unsigned int start_offset_percent;
+
 	unsigned int iolog;
 	unsigned int rwmixcycle;
 	unsigned int rwmix[DDIR_RWDIR_CNT];
@@ -206,6 +213,7 @@
 	unsigned int ioprio_class;
 	unsigned int file_service_type;
 	unsigned int group_reporting;
+	unsigned int stats;
 	unsigned int fadvise_hint;
 	unsigned int fadvise_stream;
 	enum fio_fallocate_mode fallocate_mode;
@@ -232,6 +240,7 @@
 	unsigned int trim_zero;
 	unsigned long long trim_backlog;
 	unsigned int clat_percentiles;
+	unsigned int lat_percentiles;
 	unsigned int percentile_precision;	/* digits after decimal for percentiles */
 	fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
 
@@ -300,7 +309,6 @@
 	fio_fp64_t latency_percentile;
 
 	unsigned block_error_hist;
-	unsigned int skip_bad;
 
 	unsigned int replay_align;
 	unsigned int replay_scale;
@@ -335,10 +343,11 @@
 	uint32_t iodepth_batch;
 	uint32_t iodepth_batch_complete_min;
 	uint32_t iodepth_batch_complete_max;
-	uint32_t __proper_alignment_for_64b;
+	uint32_t serialize_overlap;
+	uint32_t lat_percentiles;
 
 	uint64_t size;
-	uint64_t io_limit;
+	uint64_t io_size;
 	uint32_t size_percent;
 	uint32_t fill_device;
 	uint32_t file_append;
@@ -372,6 +381,7 @@
 	uint32_t end_fsync;
 	uint32_t pre_read;
 	uint32_t sync_io;
+	uint32_t write_hint;
 	uint32_t verify;
 	uint32_t do_verify;
 	uint32_t verifysort;
@@ -410,10 +420,10 @@
 	uint32_t bs_unaligned;
 	uint32_t fsync_on_close;
 	uint32_t bs_is_seq_rand;
-	uint32_t pad1;
 
 	uint32_t random_distribution;
 	uint32_t exitall_error;
+	uint32_t pad;
 
 	struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
 	uint32_t zone_split_nr[DDIR_RWDIR_CNT];
@@ -466,6 +476,8 @@
 	uint8_t verify_cpumask[FIO_TOP_STR_MAX];
 	uint8_t log_gz_cpumask[FIO_TOP_STR_MAX];
 #endif
+	uint32_t gpu_dev_id;
+	uint32_t start_offset_percent;
 	uint32_t cpus_allowed_policy;
 	uint32_t iolog;
 	uint32_t rwmixcycle;
@@ -475,6 +487,7 @@
 	uint32_t ioprio_class;
 	uint32_t file_service_type;
 	uint32_t group_reporting;
+	uint32_t stats;
 	uint32_t fadvise_hint;
 	uint32_t fadvise_stream;
 	uint32_t fallocate_mode;
@@ -502,7 +515,6 @@
 	uint64_t trim_backlog;
 	uint32_t clat_percentiles;
 	uint32_t percentile_precision;
-	uint32_t padding;	/* REMOVE ME when possible to maintain alignment */
 	fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
 
 	uint8_t read_iolog_file[FIO_TOP_STR_MAX];
@@ -571,7 +583,6 @@
 	fio_fp64_t latency_percentile;
 
 	uint32_t block_error_hist;
-	uint32_t skip_bad;
 
 	uint32_t replay_align;
 	uint32_t replay_scale;
diff -Nru fio-2.16/time.c fio-3.1/time.c
--- fio-2.16/time.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/time.c	2017-09-28 10:23:20.000000000 +0000
@@ -3,15 +3,23 @@
 
 #include "fio.h"
 
-static struct timeval genesis;
+static struct timespec genesis;
 static unsigned long ns_granularity;
 
-void timeval_add_msec(struct timeval *tv, unsigned int msec)
+/* Advance *ts by msec milliseconds, carrying into tv_sec as needed. */
+void timespec_add_msec(struct timespec *ts, unsigned int msec)
 {
-	tv->tv_usec += 1000 * msec;
-	if (tv->tv_usec >= 1000000) {
-		tv->tv_usec -= 1000000;
-		tv->tv_sec++;
+	uint64_t adj_nsec = 1000000ULL * msec;
+
+	/*
+	 * Split first: tv_nsec then stays below 2e9 for a normalized ts,
+	 * so it cannot overflow even where long is only 32 bits wide.
+	 */
+	ts->tv_sec += adj_nsec / 1000000000;
+	ts->tv_nsec += adj_nsec % 1000000000;
+	if (ts->tv_nsec >= 1000000000) {
+		ts->tv_nsec -= 1000000000;
+		ts->tv_sec++;
 	}
 }
 
@@ -20,7 +28,7 @@
  */
 uint64_t usec_spin(unsigned int usec)
 {
-	struct timeval start;
+	struct timespec start;
 	uint64_t t;
 
 	fio_gettime(&start, NULL);
@@ -33,7 +41,7 @@
 uint64_t usec_sleep(struct thread_data *td, unsigned long usec)
 {
 	struct timespec req;
-	struct timeval tv;
+	struct timespec tv;
 	uint64_t t = 0;
 
 	do {
@@ -89,31 +97,37 @@
 	return td->o.ramp_time && !td->ramp_time_over;
 }
 
-static void parent_update_ramp(struct thread_data *td)
+static bool parent_update_ramp(struct thread_data *td)
 {
 	struct thread_data *parent = td->parent;
 
 	if (!parent || parent->ramp_time_over)
-		return;
+		return false;
 
 	reset_all_stats(parent);
-	parent->ramp_time_over = 1;
+	parent->ramp_time_over = true;
 	td_set_runstate(parent, TD_RAMP);
+	return true;
 }
 
 bool ramp_time_over(struct thread_data *td)
 {
-	struct timeval tv;
-
 	if (!td->o.ramp_time || td->ramp_time_over)
 		return true;
 
-	fio_gettime(&tv, NULL);
-	if (utime_since(&td->epoch, &tv) >= td->o.ramp_time) {
-		td->ramp_time_over = 1;
+	if (utime_since_now(&td->epoch) >= td->o.ramp_time) {
+		td->ramp_time_over = true;
 		reset_all_stats(td);
 		td_set_runstate(td, TD_RAMP);
-		parent_update_ramp(td);
+
+		/*
+		 * If we have a parent, the parent isn't doing IO. Hence
+		 * the parent never enters do_io(), which will switch us
+		 * from RAMP -> RUNNING. Do this manually here.
+		 */
+		if (parent_update_ramp(td))
+			td_set_runstate(td, TD_RUNNING);
+
 		return true;
 	}
 
@@ -130,8 +144,7 @@
 	 * Check the granularity of the nanosleep function
 	 */
 	for (i = 0; i < 10; i++) {
-		struct timeval tv;
-		struct timespec ts;
+		struct timespec tv, ts;
 		unsigned long elapsed;
 
 		fio_gettime(&tv, NULL);
@@ -162,7 +175,7 @@
 	}
 }
 
-void fill_start_time(struct timeval *t)
+void fill_start_time(struct timespec *t)
 {
 	memcpy(t, &genesis, sizeof(genesis));
 }
diff -Nru fio-2.16/tools/fio_jsonplus_clat2csv fio-3.1/tools/fio_jsonplus_clat2csv
--- fio-2.16/tools/fio_jsonplus_clat2csv	1970-01-01 00:00:00.000000000 +0000
+++ fio-3.1/tools/fio_jsonplus_clat2csv	2017-09-28 10:23:20.000000000 +0000
@@ -0,0 +1,164 @@
+#!/usr/bin/python
+#
+# fio_jsonplus_clat2csv
+#
+# This script converts fio's json+ completion latency data to CSV format.
+#
+# For example:
+#
+# Run the following fio jobs:
+# ../fio --output=fio-jsonplus.output --output-format=json+ --name=test1
+#  	--ioengine=null --time_based --runtime=5s --size=1G --rw=randrw
+# 	--name=test2 --ioengine=null --time_based --runtime=3s --size=1G
+# 	--rw=read --name=test3 --ioengine=null --time_based --runtime=4s
+# 	--size=8G --rw=write
+#
+# Then run:
+# fio_jsonplus_clat2csv fio-jsonplus.output fio-latency.csv
+#
+# You will end up with the following 3 files
+#
+# -rw-r--r-- 1 root root  6467 Jun 27 14:57 fio-latency_job0.csv
+# -rw-r--r-- 1 root root  3985 Jun 27 14:57 fio-latency_job1.csv
+# -rw-r--r-- 1 root root  4490 Jun 27 14:57 fio-latency_job2.csv
+#
+# fio-latency_job0.csv will look something like:
+#
+# clat_nsec, read_count, read_cumulative, read_percentile, write_count,
+# 	write_cumulative, write_percentile, trim_count, trim_cumulative,
+# 	trim_percentile,
+# 25, 1, 1, 1.50870705013e-07, , , , , , ,
+# 26, 12, 13, 1.96131916517e-06, 947, 947, 0.000142955890032, , , ,
+# 27, 843677, 843690, 0.127288105112, 838347, 839294, 0.126696959629, , , ,
+# 28, 1877982, 2721672, 0.410620573454, 1870189, 2709483, 0.409014312345, , , ,
+# 29, 4471, 2726143, 0.411295116376, 7718, 2717201, 0.410179395301, , , ,
+# 30, 2142885, 4869028, 0.734593687087, 2138164, 4855365, 0.732949340025, , , ,
+# ...
+# 2544, , , , 2, 6624404, 0.999997433738, , , ,
+# 2576, 3, 6628178, 0.99999788781, 4, 6624408, 0.999998037564, , , ,
+# 2608, 4, 6628182, 0.999998491293, 4, 6624412, 0.999998641391, , , ,
+# 2640, 3, 6628185, 0.999998943905, 2, 6624414, 0.999998943304, , , ,
+# 2672, 1, 6628186, 0.999999094776, 3, 6624417, 0.999999396174, , , ,
+# 2736, 1, 6628187, 0.999999245646, 1, 6624418, 0.99999954713, , , ,
+# 2768, 2, 6628189, 0.999999547388, 1, 6624419, 0.999999698087, , , ,
+# 2800, , , , 1, 6624420, 0.999999849043, , , ,
+# 2832, 1, 6628190, 0.999999698259, , , , , , ,
+# 4192, 1, 6628191, 0.999999849129, , , , , , ,
+# 5792, , , , 1, 6624421, 1.0, , , ,
+# 10304, 1, 6628192, 1.0, , , , , , ,
+#
+# The first line says that you had one read IO with 25ns clat,
+# the cumulative number of read IOs at or below 25ns is 1, and
+# 25ns is the 0.00001509th percentile for read latency
+#
+# The job had 2 write IOs complete in 2544ns,
+# 6624404 write IOs completed in 2544ns or less,
+# and this represents the 99.99974th percentile for write latency
+#
+# The last line says that one read IO had 10304ns clat,
+# 6628192 read IOs had 10304ns or shorter clat, and
+# 10304ns is the 100th percentile for read latency
+#
+
+import os
+import json
+import argparse
+
+
+def parse_args():
+    """Parse the two positional arguments, source and dest."""
+    source_help = ('fio json+ output file containing completion '
+                   'latency data')
+    dest_help = ('destination file stub for latency data in CSV '
+                 'format. job number will be appended to filename')
+
+    cli = argparse.ArgumentParser()
+    cli.add_argument('source', help=source_help)
+    cli.add_argument('dest', help=dest_help)
+    return cli.parse_args()
+
+
+def percentile(idx, run_total):
+    """Return run_total[idx] as a fraction of the final running total."""
+    grand_total = run_total[-1]
+    if grand_total == 0:
+        return 0
+    return float(run_total[idx]) / grand_total
+
+
+def more_lines(indices, bins):
+    """Return True while any key in indices still has unread bins.
+
+    indices maps key -> next position; bins maps key -> list of bins.
+    """
+    return any(pos < len(bins[key]) for key, pos in indices.items())
+
+
+def main():
+    """Write <dest stub>_job<N><ext>, one latency CSV per job in source."""
+    args = parse_args()
+
+    with open(args.source, 'r') as source:
+        jsondata = json.loads(source.read())
+
+    # Fixed order keeps the CSV columns stable: read, write, trim.
+    ddir_list = ['read', 'write', 'trim']
+
+    for jobnum in range(0, len(jsondata['jobs'])):
+        bins = {}
+        run_total = {}
+
+        for ddir in ddir_list:
+            # Bin keys are latency strings in the JSON; sort them as ints.
+            bins[ddir] = sorted(
+                [[int(key), value] for key, value in
+                 jsondata['jobs'][jobnum][ddir]['clat_ns']
+                 ['bins'].items()],
+                key=lambda entry: entry[0])
+
+            # run_total[ddir][x] is the cumulative count through bin x.
+            run_total[ddir] = []
+            cumulative = 0
+            for _, count in bins[ddir]:
+                cumulative += count
+                run_total[ddir].append(cumulative)
+
+        stub, ext = os.path.splitext(args.dest)
+        outfile = stub + '_job' + str(jobnum) + ext
+
+        with open(outfile, 'w') as output:
+            output.write("clat_nsec, ")
+            for ddir in ddir_list:
+                output.write("{0}_count, {0}_cumulative, {0}_percentile, ".
+                             format(ddir))
+            output.write("\n")
+
+            # Have a counter for each ddir. In each round, pick the
+            # shortest remaining duration and output a line with any
+            # values for that duration.
+            indices = {x: 0 for x in ddir_list}
+            while more_lines(indices, bins):
+                min_lat = min(bins[ddir][indices[ddir]][0]
+                              for ddir in ddir_list
+                              if indices[ddir] < len(bins[ddir]))
+
+                output.write("{0}, ".format(min_lat))
+
+                for ddir in ddir_list:
+                    if indices[ddir] < len(bins[ddir]) and \
+                       min_lat == bins[ddir][indices[ddir]][0]:
+                        count = bins[ddir][indices[ddir]][1]
+                        cumulative = run_total[ddir][indices[ddir]]
+                        ptile = percentile(indices[ddir], run_total[ddir])
+                        output.write("{0}, {1}, {2}, ".format(count,
+                                     cumulative, ptile))
+                        indices[ddir] += 1
+                    else:
+                        output.write(", , , ")
+                output.write("\n")
+
+            print("{0} generated".format(outfile))
+
+
+if __name__ == '__main__':
+    main()
diff -Nru fio-2.16/tools/fio_latency2csv.py fio-3.1/tools/fio_latency2csv.py
--- fio-2.16/tools/fio_latency2csv.py	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/tools/fio_latency2csv.py	1970-01-01 00:00:00.000000000 +0000
@@ -1,101 +0,0 @@
-#!/usr/bin/python
-#
-# fio_latency2csv.py
-#
-# This tool converts fio's json+ completion latency data to CSV format.
-# For example:
-#
-# fio_latency2csv.py fio-jsonplus.output fio-latency.csv
-#
-
-import os
-import json
-import argparse
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('source',
-                        help='fio json+ output file containing completion '
-                             'latency data')
-    parser.add_argument('dest',
-                        help='destination file stub for latency data in CSV '
-                             'format. job number will be appended to filename')
-    args = parser.parse_args()
-
-    return args
-
-
-# from stat.c
-def plat_idx_to_val(idx, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64):
-    # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
-    # all bits of the sample as index
-    if (idx < (FIO_IO_U_PLAT_VAL << 1)):
-        return idx
-
-    # Find the group and compute the minimum value of that group
-    error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1
-    base = 1 << (error_bits + FIO_IO_U_PLAT_BITS)
-
-    # Find its bucket number of the group
-    k = idx % FIO_IO_U_PLAT_VAL
-
-    # Return the mean of the range of the bucket
-    return (base + ((k + 0.5) * (1 << error_bits)))
-
-
-def percentile(idx, run_total):
-    total = run_total[len(run_total)-1]
-    if total == 0:
-        return 0
-
-    return float(run_total[x]) / total
-
-
-if __name__ == '__main__':
-    args = parse_args()
-
-    with open(args.source, 'r') as source:
-        jsondata = json.loads(source.read())
-
-    bins = {}
-    bin_const = {}
-    run_total = {}
-    ddir_list = ['read', 'write', 'trim']
-    const_list = ['FIO_IO_U_PLAT_NR', 'FIO_IO_U_PLAT_BITS',
-                  'FIO_IO_U_PLAT_VAL']
-
-    for jobnum in range(0,len(jsondata['jobs'])):
-        prev_ddir = None
-        for ddir in ddir_list:
-            bins[ddir] = jsondata['jobs'][jobnum][ddir]['clat']['bins']
-
-            bin_const[ddir] = {}
-            for const in const_list:
-                bin_const[ddir][const] = bins[ddir].pop(const)
-                if prev_ddir:
-                    assert bin_const[ddir][const] == bin_const[prev_ddir][const]
-            prev_ddir = ddir
-
-            run_total[ddir] = [0 for x in
-                               range(bin_const[ddir]['FIO_IO_U_PLAT_NR'])]
-            run_total[ddir][0] = bins[ddir]['0']
-            for x in range(1, bin_const[ddir]['FIO_IO_U_PLAT_NR']):
-                run_total[ddir][x] = run_total[ddir][x-1] + bins[ddir][str(x)]
-        
-        stub, ext = os.path.splitext(args.dest)
-        outfile = stub + '_job' + str(jobnum) + ext
-
-        with open(outfile, 'w') as output:
-            output.write("clat (usec),")
-            for ddir in ddir_list:
-                output.write("{0},".format(ddir))
-            output.write("\n")
-
-            for x in range(bin_const['read']['FIO_IO_U_PLAT_NR']):
-                output.write("{0},".format(plat_idx_to_val(x,
-                                          bin_const['read']['FIO_IO_U_PLAT_BITS'],
-                                          bin_const['read']['FIO_IO_U_PLAT_VAL'])))
-                for ddir in ddir_list:
-                    output.write("{0},".format(percentile(x, run_total[ddir])))
-                output.write("\n")
diff -Nru fio-2.16/tools/fiologparser.py fio-3.1/tools/fiologparser.py
--- fio-2.16/tools/fiologparser.py	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/tools/fiologparser.py	2017-09-28 10:23:20.000000000 +0000
@@ -45,7 +45,7 @@
     while (start < ftime):
         end = ftime if ftime < end else end
         results = [ts.get_value(start, end) for ts in series]
-        print "%s, %s" % (end, ', '.join(["%0.3f" % i for i in results]))
+        print("%s, %s" % (end, ', '.join(["%0.3f" % i for i in results])))
         start += ctx.interval
         end += ctx.interval
 
@@ -57,7 +57,7 @@
     while (start < ftime):
         end = ftime if ftime < end else end
         results = [ts.get_value(start, end) for ts in series]
-        print "%s, %0.3f" % (end, sum(results))
+        print("%s, %0.3f" % (end, sum(results)))
         start += ctx.interval
         end += ctx.interval
 
@@ -69,7 +69,7 @@
     while (start < ftime):
         end = ftime if ftime < end else end
         results = [ts.get_value(start, end) for ts in series]
-        print "%s, %0.3f" % (end, float(sum(results))/len(results))
+        print("%s, %0.3f" % (end, float(sum(results))/len(results)))
         start += ctx.interval
         end += ctx.interval
 
@@ -147,11 +147,11 @@
         end += ctx.interval
 
     total = 0
-    for i in xrange(0, len(averages)):
+    for i in range(0, len(averages)):
         total += averages[i]*weights[i]
-    print '%0.3f' % (total/sum(weights))
+    print('%0.3f' % (total/sum(weights)))
  
-class TimeSeries():
+class TimeSeries(object):
     def __init__(self, ctx, fn):
         self.ctx = ctx
         self.last = None 
@@ -185,7 +185,7 @@
             value += sample.get_contribution(start, end)
         return value
 
-class Sample():
+class Sample(object):
     def __init__(self, ctx, start, end, value):
        self.ctx = ctx
        self.start = start
diff -Nru fio-2.16/tools/hist/fiologparser_hist.py fio-3.1/tools/hist/fiologparser_hist.py
--- fio-2.16/tools/hist/fiologparser_hist.py	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/tools/hist/fiologparser_hist.py	2017-09-28 10:23:20.000000000 +0000
@@ -373,7 +373,7 @@
         help='print warning messages to stderr')
 
     arg('--group_nr',
-        default=19,
+        default=29,
         type=int,
         help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h')
 
diff -Nru fio-2.16/tools/plot/fio2gnuplot.1 fio-3.1/tools/plot/fio2gnuplot.1
--- fio-2.16/tools/plot/fio2gnuplot.1	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/tools/plot/fio2gnuplot.1	2017-09-28 10:23:20.000000000 +0000
@@ -1,5 +1,5 @@
 .\" Text automatically generated by txt2man
-.TH fio2gnuplot  "07 août 2013" "" ""
+.TH fio2gnuplot 1 "August 2013"
 .SH NAME
 \fBfio2gnuplot \fP- Render fio's output files with gnuplot
 .SH SYNOPSIS
diff -Nru fio-2.16/.travis.yml fio-3.1/.travis.yml
--- fio-2.16/.travis.yml	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/.travis.yml	2017-09-28 10:23:20.000000000 +0000
@@ -1,7 +1,51 @@
 language: c
+os:
+  - linux
 compiler:
   - clang
   - gcc
+env:
+  matrix:
+    - BUILD_ARCH="x86"
+    - BUILD_ARCH="x86_64"
+  global:
+    - MAKEFLAGS="-j 2"
+matrix:
+  include:
+    - os: osx
+      compiler: clang # Workaround travis setting CC=["clang", "gcc"]
+      env: BUILD_ARCH="x86_64"
+    # Build using the 10.12 SDK but target and run on OSX 10.11
+#   - os: osx
+#     compiler: clang
+#     osx_image: xcode8
+#     env: SDKROOT=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk MACOSX_DEPLOYMENT_TARGET=10.11
+    # Build on the latest OSX version (will eventually become obsolete)
+    - os: osx
+      compiler: clang
+      osx_image: xcode8.3
+      env: BUILD_ARCH="x86_64"
+  exclude:
+    - os: osx
+      compiler: gcc
+    # Only do the gcc x86 build on linux, to reduce clutter
+    - os: linux
+      compiler: clang
+      env: BUILD_ARCH="x86"
 before_install:
-  - sudo apt-get -qq update
-  - sudo apt-get install -qq -y libaio-dev libnuma-dev libz-dev
+  - EXTRA_CFLAGS="-Werror"
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+        pkgs=(libaio-dev libnuma-dev libz-dev librbd-dev libibverbs-dev librdmacm-dev);
+        if [[ "$BUILD_ARCH" == "x86" ]]; then
+            pkgs=("${pkgs[@]/%/:i386}");
+            pkgs+=(gcc-multilib);
+            EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32";
+        else
+            pkgs+=(glusterfs-common);
+        fi;
+        sudo apt-get -qq update;
+        sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}";
+    fi
+script:
+  - ./configure --extra-cflags="${EXTRA_CFLAGS}" && make
+  - make test
diff -Nru fio-2.16/unit_tests/steadystate_tests.py fio-3.1/unit_tests/steadystate_tests.py
--- fio-2.16/unit_tests/steadystate_tests.py	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/unit_tests/steadystate_tests.py	2017-09-28 10:23:20.000000000 +0000
@@ -115,7 +115,7 @@
     if args.read == None:
         if os.name == 'posix':
             args.read = '/dev/zero'
-            extra = [ "--size=128M" ]
+            extra = [ "--size=134217728" ]  # 128 MiB
         else:
             print "ERROR: file for read testing must be specified on non-posix systems"
             sys.exit(1)
diff -Nru fio-2.16/verify.c fio-3.1/verify.c
--- fio-2.16/verify.c	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/verify.c	2017-09-28 10:23:20.000000000 +0000
@@ -25,6 +25,7 @@
 #include "crc/sha512.h"
 #include "crc/sha1.h"
 #include "crc/xxhash.h"
+#include "crc/sha3.h"
 
 static void populate_hdr(struct thread_data *td, struct io_u *io_u,
 			 struct verify_header *hdr, unsigned int header_num,
@@ -172,6 +173,18 @@
 	case VERIFY_SHA512:
 		len = sizeof(struct vhdr_sha512);
 		break;
+	case VERIFY_SHA3_224:
+		len = sizeof(struct vhdr_sha3_224);
+		break;
+	case VERIFY_SHA3_256:
+		len = sizeof(struct vhdr_sha3_256);
+		break;
+	case VERIFY_SHA3_384:
+		len = sizeof(struct vhdr_sha3_384);
+		break;
+	case VERIFY_SHA3_512:
+		len = sizeof(struct vhdr_sha3_512);
+		break;
 	case VERIFY_XXHASH:
 		len = sizeof(struct vhdr_xxhash);
 		break;
@@ -258,6 +271,7 @@
 	fd = open(fname, O_CREAT | O_TRUNC | O_WRONLY, 0644);
 	if (fd < 0) {
 		perror("open verify buf file");
+		free(ptr);
 		return;
 	}
 
@@ -374,7 +388,7 @@
 	(void)paste_format_inplace(pattern, pattern_size,
 				   td->o.verify_fmt, td->o.verify_fmt_sz, io_u);
 
-	buf = (void *) hdr + header_size;
+	buf = (char *) hdr + header_size;
 	len = get_hdr_inc(td, io_u) - header_size;
 	mod = (get_hdr_inc(td, io_u) * vc->hdr_num + header_size) % pattern_size;
 
@@ -393,7 +407,8 @@
 				(unsigned char)pattern[mod],
 				bits);
 			log_err("fio: bad pattern block offset %u\n", i);
-			dump_verify_buffers(hdr, vc);
+			vc->name = "pattern";
+			log_verify_failure(hdr, vc);
 			return EILSEQ;
 		}
 		mod++;
@@ -430,6 +445,84 @@
 	return EILSEQ;
 }
 
+/* Hash the data for hdr and compare with the expected digest in sha. */
+static int verify_io_u_sha3(struct verify_header *hdr, struct vcont *vc,
+			    struct fio_sha3_ctx *sha3_ctx, uint8_t *sha,
+			    unsigned int sha_size, const char *name)
+{
+	void *data = io_u_verify_off(hdr, vc);
+
+	dprint(FD_VERIFY, "%s verify io_u %p, len %u\n", name, vc->io_u, hdr->len);
+	fio_sha3_update(sha3_ctx, data, hdr->len - hdr_size(vc->td, hdr));
+	fio_sha3_final(sha3_ctx);
+	if (memcmp(sha, sha3_ctx->sha, sha_size) != 0) {
+		/* Record expected and computed digests, then log the failure. */
+		vc->name = name;
+		vc->good_crc = sha;
+		vc->bad_crc = sha3_ctx->sha;
+		vc->crc_len = sha_size;
+		log_verify_failure(hdr, vc);
+		return EILSEQ;
+	}
+	return 0;
+}
+
+/*
+ * Check the block against the sha3-224 digest found via hdr_priv(hdr).
+ */
+static int verify_io_u_sha3_224(struct verify_header *hdr, struct vcont *vc)
+{
+	uint8_t digest[SHA3_224_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = digest };
+	struct vhdr_sha3_224 *stored = hdr_priv(hdr);
+
+	fio_sha3_224_init(&ctx);
+	return verify_io_u_sha3(hdr, vc, &ctx, stored->sha,
+				SHA3_224_DIGEST_SIZE, "sha3-224");
+}
+
+/*
+ * Check the block against the sha3-256 digest found via hdr_priv(hdr).
+ */
+static int verify_io_u_sha3_256(struct verify_header *hdr, struct vcont *vc)
+{
+	uint8_t digest[SHA3_256_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = digest };
+	struct vhdr_sha3_256 *stored = hdr_priv(hdr);
+
+	fio_sha3_256_init(&ctx);
+	return verify_io_u_sha3(hdr, vc, &ctx, stored->sha,
+				SHA3_256_DIGEST_SIZE, "sha3-256");
+}
+
+/*
+ * Check the block against the sha3-384 digest found via hdr_priv(hdr).
+ */
+static int verify_io_u_sha3_384(struct verify_header *hdr, struct vcont *vc)
+{
+	uint8_t digest[SHA3_384_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = digest };
+	struct vhdr_sha3_384 *stored = hdr_priv(hdr);
+
+	fio_sha3_384_init(&ctx);
+	return verify_io_u_sha3(hdr, vc, &ctx, stored->sha,
+				SHA3_384_DIGEST_SIZE, "sha3-384");
+}
+
+/*
+ * Check the block against the sha3-512 digest found via hdr_priv(hdr).
+ */
+static int verify_io_u_sha3_512(struct verify_header *hdr, struct vcont *vc)
+{
+	uint8_t digest[SHA3_512_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = digest };
+	struct vhdr_sha3_512 *stored = hdr_priv(hdr);
+
+	fio_sha3_512_init(&ctx);
+	return verify_io_u_sha3(hdr, vc, &ctx, stored->sha,
+				SHA3_512_DIGEST_SIZE, "sha3-512");
+}
+
 static int verify_io_u_sha512(struct verify_header *hdr, struct vcont *vc)
 {
 	void *p = io_u_verify_off(hdr, vc);
@@ -759,7 +852,7 @@
 	 * state of numberio, that would have been written to each block
 	 * in a previous run of fio, has been reached.
 	 */
-	if ((td_write(td) || td_rw(td)) && (td_min_bs(td) == td_max_bs(td)) &&
+	if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) &&
 	    !td->o.time_based)
 		if (!td->o.verify_only || td->o.loops == 0)
 			if (hdr->numberio != io_u->numberio) {
@@ -881,6 +974,18 @@
 		case VERIFY_SHA512:
 			ret = verify_io_u_sha512(hdr, &vc);
 			break;
+		case VERIFY_SHA3_224:
+			ret = verify_io_u_sha3_224(hdr, &vc);
+			break;
+		case VERIFY_SHA3_256:
+			ret = verify_io_u_sha3_256(hdr, &vc);
+			break;
+		case VERIFY_SHA3_384:
+			ret = verify_io_u_sha3_384(hdr, &vc);
+			break;
+		case VERIFY_SHA3_512:
+			ret = verify_io_u_sha3_512(hdr, &vc);
+			break;
 		case VERIFY_XXHASH:
 			ret = verify_io_u_xxhash(hdr, &vc);
 			break;
@@ -918,6 +1023,56 @@
 	vh->hash = XXH32_digest(state);
 }
 
+static void fill_sha3(struct fio_sha3_ctx *sha3_ctx, void *p, unsigned int len)
+{
+	fio_sha3_update(sha3_ctx, p, len);	/* hash the len bytes at p */
+	fio_sha3_final(sha3_ctx);		/* finish; callers read ->sha */
+}
+
+/* Compute the sha3-224 digest of the len bytes at p for header hdr. */
+static void fill_sha3_224(struct verify_header *hdr, void *p, unsigned int len)
+{
+	struct vhdr_sha3_224 *out = hdr_priv(hdr);
+	/* Point the context's digest buffer at out->sha. */
+	struct fio_sha3_ctx ctx = { .sha = out->sha };
+
+	fio_sha3_224_init(&ctx);
+	fill_sha3(&ctx, p, len);
+}
+
+/* Compute the sha3-256 digest of the len bytes at p for header hdr. */
+static void fill_sha3_256(struct verify_header *hdr, void *p, unsigned int len)
+{
+	struct vhdr_sha3_256 *out = hdr_priv(hdr);
+	/* Point the context's digest buffer at out->sha. */
+	struct fio_sha3_ctx ctx = { .sha = out->sha };
+
+	fio_sha3_256_init(&ctx);
+	fill_sha3(&ctx, p, len);
+}
+
+/* Compute the sha3-384 digest of the len bytes at p for header hdr. */
+static void fill_sha3_384(struct verify_header *hdr, void *p, unsigned int len)
+{
+	struct vhdr_sha3_384 *out = hdr_priv(hdr);
+	/* Point the context's digest buffer at out->sha. */
+	struct fio_sha3_ctx ctx = { .sha = out->sha };
+
+	fio_sha3_384_init(&ctx);
+	fill_sha3(&ctx, p, len);
+}
+
+/* Compute the sha3-512 digest of the len bytes at p for header hdr. */
+static void fill_sha3_512(struct verify_header *hdr, void *p, unsigned int len)
+{
+	struct vhdr_sha3_512 *out = hdr_priv(hdr);
+	/* Point the context's digest buffer at out->sha. */
+	struct fio_sha3_ctx ctx = { .sha = out->sha };
+
+	fio_sha3_512_init(&ctx);
+	fill_sha3(&ctx, p, len);
+}
+
 static void fill_sha512(struct verify_header *hdr, void *p, unsigned int len)
 {
 	struct vhdr_sha512 *vh = hdr_priv(hdr);
@@ -1012,7 +1167,7 @@
 	hdr->rand_seed = rand_seed;
 	hdr->offset = io_u->offset + header_num * td->o.verify_interval;
 	hdr->time_sec = io_u->start_time.tv_sec;
-	hdr->time_usec = io_u->start_time.tv_usec;
+	hdr->time_usec = io_u->start_time.tv_nsec / 1000;
 	hdr->thread = td->thread_number;
 	hdr->numberio = io_u->numberio;
 	hdr->crc32 = fio_crc32c(p, offsetof(struct verify_header, crc32));
@@ -1033,9 +1188,10 @@
 			 unsigned int header_len)
 {
 	unsigned int data_len;
-	void *data, *p;
+	void *data;
+	char *p;
 
-	p = (void *) hdr;
+	p = (char *) hdr;
 
 	fill_hdr(td, io_u, hdr, header_num, header_len, io_u->rand_seed);
 
@@ -1084,6 +1240,26 @@
 						io_u, hdr->len);
 		fill_sha512(hdr, data, data_len);
 		break;
+	case VERIFY_SHA3_224:
+		dprint(FD_VERIFY, "fill sha3-224 io_u %p, len %u\n",
+						io_u, hdr->len);
+		fill_sha3_224(hdr, data, data_len);
+		break;
+	case VERIFY_SHA3_256:
+		dprint(FD_VERIFY, "fill sha3-256 io_u %p, len %u\n",
+						io_u, hdr->len);
+		fill_sha3_256(hdr, data, data_len);
+		break;
+	case VERIFY_SHA3_384:
+		dprint(FD_VERIFY, "fill sha3-384 io_u %p, len %u\n",
+						io_u, hdr->len);
+		fill_sha3_384(hdr, data, data_len);
+		break;
+	case VERIFY_SHA3_512:
+		dprint(FD_VERIFY, "fill sha3-512 io_u %p, len %u\n",
+						io_u, hdr->len);
+		fill_sha3_512(hdr, data, data_len);
+		break;
 	case VERIFY_XXHASH:
 		dprint(FD_VERIFY, "fill xxhash io_u %p, len %u\n",
 						io_u, hdr->len);
@@ -1211,6 +1387,7 @@
 {
 	if (td->o.verify == VERIFY_CRC32C_INTEL ||
 	    td->o.verify == VERIFY_CRC32C) {
+		crc32c_arm64_probe();
 		crc32c_intel_probe();
 	}
 }
diff -Nru fio-2.16/verify.h fio-3.1/verify.h
--- fio-2.16/verify.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/verify.h	2017-09-28 10:23:20.000000000 +0000
@@ -20,6 +20,10 @@
 	VERIFY_CRC7,			/* crc7 sum data blocks */
 	VERIFY_SHA256,			/* sha256 sum data blocks */
 	VERIFY_SHA512,			/* sha512 sum data blocks */
+	VERIFY_SHA3_224,		/* sha3-224 sum data blocks */
+	VERIFY_SHA3_256,		/* sha3-256 sum data blocks */
+	VERIFY_SHA3_384,		/* sha3-384 sum data blocks */
+	VERIFY_SHA3_512,		/* sha3-512 sum data blocks */
 	VERIFY_XXHASH,			/* xxhash sum data blocks */
 	VERIFY_SHA1,			/* sha1 sum data blocks */
 	VERIFY_PATTERN,			/* verify specific patterns */
@@ -48,6 +52,18 @@
 struct vhdr_md5 {
 	uint32_t md5_digest[4];
 };
+struct vhdr_sha3_224 {
+	uint8_t sha[224 / 8];	/* 224-bit digest, 28 bytes */
+};
+struct vhdr_sha3_256 {
+	uint8_t sha[256 / 8];	/* 256-bit digest, 32 bytes */
+};
+struct vhdr_sha3_384 {
+	uint8_t sha[384 / 8];	/* 384-bit digest, 48 bytes */
+};
+struct vhdr_sha3_512 {
+	uint8_t sha[512 / 8];	/* 512-bit digest, 64 bytes */
+};
 struct vhdr_sha512 {
 	uint8_t sha512[128];
 };
diff -Nru fio-2.16/verify-state.h fio-3.1/verify-state.h
--- fio-2.16/verify-state.h	2016-12-20 06:12:56.000000000 +0000
+++ fio-3.1/verify-state.h	2017-09-28 10:23:20.000000000 +0000
@@ -77,7 +77,7 @@
 
 static inline struct thread_io_list *io_list_next(struct thread_io_list *s)
 {
-	return (void *) s + thread_io_list_sz(s);
+	return (struct thread_io_list *)((char *) s + thread_io_list_sz(s));
 }
 
 static inline void verify_state_gen_name(char *out, size_t size,