#! /bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2021 SUSE Linux Products GmbH. All Rights Reserved.
#
# FSQA Test No. 236
#
# Test for fsync data loss after renaming a file or adding a hard link, with a
# previous fsync of another file, as well as that mtime and ctime are correct.
# Test both with COW and NOCOW writes.
#
. ./common/preamble
_begin_fstest auto quick log

# Override the default cleanup function.
_cleanup()
{
	_cleanup_flakey
	cd /
	rm -f $tmp.*
}

. ./common/filter
. ./common/dmflakey

_require_scratch
_require_dm_target flakey

# The comments inside the function mentioning specific inode numbers and IDs
# (transactions, log commits, etc) are for the case where the function is run
# on a freshly created filesystem, but the logic and reasoning still applies
# for future invocations.
test_fsync()
{
	local suffix=$1
	local op_type=$2
	local foo="$SCRATCH_MNT/foo_$suffix"
	local bar="$SCRATCH_MNT/bar_$suffix"
	local baz="$SCRATCH_MNT/baz_$suffix"
	local digest_before
	local digest_after
	local mtime_before
	local ctime_before
	local mtime_after
	local ctime_after

	if [ "$op_type" != "rename" ] && [ "$op_type" != "link" ]; then
		_fail "Operation type, 2nd argument, must be 'rename' or 'link'"
	fi

	# Create an extra empty file to be used later for syncing the log and
	# bumping the committed log transaction ID by +1.
	touch $bar

	# This is the file we are interested in testing for data loss after an
	# fsync. Create it and fill it with initial data, then fsync it.
	#
	# Before the fsync the inode has the full sync flag set, the current
	# log transaction has an ID of 0 (first log transaction), the inode's
	# ->logged_trans is 0, ->last_sub_trans is 0 and ->last_log_commit is -1.
	#
	# After the fsync the full sync flag is not set anymore, ->logged_trans
	# is 6 (generation of the current transaction), ->last_log_commit is 0
	# (which is the value of ->last_sub_trans) and ->last_sub_trans remains
	# as 0.
	#
	# Also, after the fsync, the log transaction ID is bumped from 0 to 1.
	#
	$XFS_IO_PROG -f -c "pwrite -S 0xab 0 1M" -c "fsync" $baz >>$seqres.full
	# File bar is inode 257.
	# File baz is inode 258.

	# During the rename or link operation below the following happens:
	#
	# We update inode 258 and that causes its ->last_sub_trans to be set to
	# 1 (the current log transaction ID), and its ->last_log_commit remains
	# with a value of 0 (meaning it was committed in the previous log
	# transaction).
	#
	# After updating inode 258, because we have previously fsynced it, we
	# log again the inode because it has a new name/dentry now. This results
	# in updating its ->last_log_commit from 0 to 1 (the current value of
	# its ->last_sub_trans).
	#
	if [ "$op_type" == "rename" ]; then
		mv $baz $foo
	else
		ln $baz $foo
	fi
	# File bar is inode 257.
	# File foo is inode 258.

	# Before the next write, sleep for 1 second so that we can test if mtime
	# and ctime are preserved after the power failure.
	sleep 1

	# This buffered write leaves ->last_sub_trans of inode 258 as 1, the ID
	# of the current log transaction (inode->root->log_transid).
	$XFS_IO_PROG -c "pwrite -S 0xcd 0 1M" $foo >>$seqres.full

	# The fsync against inode 257, file bar, results in committing the log
	# transaction with ID 1, updating inode->root->last_log_commit to 1, and
	# bumping root->log_transid from 1 to 2.
	$XFS_IO_PROG -c "fsync" $bar

	# Now on fsync of inode 258, file foo, delalloc is flushed and, because
	# the inode does not have the full sync flag set anymore, it only waits
	# for the writeback to finish, it does not wait for the ordered extent
	# to complete.
	#
	# If the ordered extent does not complete before btrfs_sync_file() calls
	# btrfs_inode_in_log(), then we would not update the inode the log and
	# sync the log, resulting in a guaranteed data loss after a power failure
	# for COW writes and with slim chances for data loss as well for NOCOW
	# writes, because the fsync would return success to user space without
	# issuing barriers (REQ_PREFLUSH not sent to the block layer).
	# That happened because btrfs_inode_in_log() would return true before the
	# ordered extent completes, as in that case the inode has ->last_sub_trans
	# set to 1, ->last_log_commit as 1 and root->last_log_commit is 1 as well.
	#
	# Also, for both COW and NOCOW writes, when the race happens we ended up
	# not logging the inode, resulting in an outdated mtime and ctime after
	# replaying the log.
	#
	$XFS_IO_PROG -c "fsync" $foo

	# Simulate a power failure and then mount again the filesystem to replay
	# the log tree.
	# After the power failure We expect $foo, inode 258, to have 1M of data
	# full of bytes with a value of 0xcd, and not 0xab.

	digest_before=$(_md5_checksum $foo)
	mtime_before=$(stat -c %Y $foo)
	ctime_before=$(stat -c %Z $foo)

	_flakey_drop_and_remount

	digest_after=$(_md5_checksum $foo)
	mtime_after=$(stat -c %Y $foo)
	ctime_after=$(stat -c %Z $foo)

	if [ $digest_after != $digest_before ]; then
		echo -n "Incorrect data after log replay, "
		echo "expected digest: $digest_before got: $digest_after"
	fi

	if [ $mtime_after != $mtime_before ]; then
		echo "mtime not preserved, expected: $mtime_before got: $mtime_after"
	fi

	if [ $ctime_after != $ctime_before ]; then
		echo "ctime not preserved, expected: $ctime_before got: $ctime_after"
	fi
}

# Test first with data cow (it's the default, but be explicit to make it clear).
_scratch_mkfs >>$seqres.full 2>&1
_require_metadata_journaling $SCRATCH_DEV
_init_flakey
MOUNT_OPTIONS="-o datacow"
_mount_flakey

# Test a few times each scenario because this test was motivated by a race
# condition.
echo "Testing fsync after rename with COW writes"
for ((i = 1; i <= 3; i++)); do
	test_fsync "rename_cow_$i" "rename"
done
echo "Testing fsync after link with COW writes"
for ((i = 1; i <= 3; i++)); do
	test_fsync "link_cow_$i" "link"
done

_unmount_flakey

# Now lets test with nodatacow.
if ! _scratch_btrfs_is_zoned; then
	MOUNT_OPTIONS="-o nodatacow"
	_mount_flakey

	echo "Testing fsync after rename with NOCOW writes"
	for ((i = 1; i <= 3; i++)); do
		test_fsync "rename_nocow_$i" "rename"
	done
	echo "Testing fsync after link with NOCOW writes"
	for ((i = 1; i <= 3; i++)); do
		test_fsync "link_nocow_$i" "link"
	done

	_unmount_flakey
else
	# Fake result. Zoned btrfs does not support NOCOW
	echo "Testing fsync after rename with NOCOW writes"
	echo "Testing fsync after link with NOCOW writes"
fi

status=0
exit
