#! /bin/bash
# FSQA Test No. 038
#
# This test was motivated by btrfs issues, but it's generic enough as it
# doesn't use any btrfs specific features.
#
# Stress btrfs' block group allocation and deallocation while running fstrim in
# parallel. Part of the goal is also to get data block groups deallocated so
# that new metadata block groups, using the same physical device space ranges,
# get allocated while fstrim is running. This caused several issues ranging
# from invalid memory accesses, kernel crashes, metadata or data corruption,
# free space cache inconsistencies, free space leaks and memory leaks.
#
# These issues were fixed by the following btrfs linux kernel patches:
#
#   Btrfs: fix invalid block group rbtree access after bg is removed
#   Btrfs: fix crash caused by block group removal
#   Btrfs: fix freeing used extents after removing empty block group
#   Btrfs: fix race between fs trimming and block group remove/allocation
#   Btrfs: fix race between writing free space cache and trimming
#   Btrfs: make btrfs_abort_transaction consider existence of new block groups
#   Btrfs: fix memory leak after block remove + trimming
#   Btrfs: fix fs mapping extent map leak
#   Btrfs: fix unprotected deletion from pending_chunks list
#
# The issues were found on a qemu/kvm guest with 4 virtual CPUs, 4GB of RAM and
# scsi-hd devices with discard support enabled (that means hole punching in the
# disk's image file is performed by the host).
#
#-----------------------------------------------------------------------
#
# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
# Author: Filipe Manana <fdmanana@suse.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#-----------------------------------------------------------------------
#

# Name of this test (taken from the script's file name) and the path prefix
# under which its result files are written.
seq=$(basename $0)
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"

# Path for temporary files, named after the PID of this shell.
tmp=/tmp/$$
# Failure is the default; the test body sets this to 0 once it has succeeded.
status=1
# Run _cleanup and exit with $status on normal exit and on HUP/INT/QUIT/TERM.
trap '_cleanup; exit $status' EXIT HUP INT QUIT TERM

# Tear down after the test. Invoked from the exit/signal trap, so it runs on
# every exit path.
#
# Background jobs of this shell that are still outstanding (for example when
# the test is interrupted or bails out early) are terminated and reaped
# first, so that none of them keeps using the scratch mount point once the
# test is over. Then the temporary files under $tmp are removed.
_cleanup()
{
	local pids

	# Jobs that were already waited for are no longer listed, so after a
	# run that reaped all its workers nothing is signalled here.
	pids=$(jobs -p)
	if [ -n "$pids" ]; then
		# A job may exit between the listing and the kill; that is
		# harmless, so do not let the complaint reach the test output.
		kill $pids 2>/dev/null
		wait
	fi

	rm -fr "$tmp"
}

# Get the standard environment, filters and checks by sourcing the shared
# helper libraries (paths are relative to the current directory).
. ./common/rc
. ./common/filter

# real QA test starts here
#
# Declare where this test applies and what it needs: any filesystem type
# ("generic"), Linux, a scratch device, and xfs_io support for the "falloc"
# command (used by fallocate_loop).
# NOTE(review): these helpers are not defined in this file - presumably they
# come from the libraries sourced above and skip the test when a requirement
# is not met; confirm there.
_supported_fs generic
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"

# Start from an empty $seqres.full log; later steps append to it.
rm -f $seqres.full

# Keep allocating and deallocating 1G of data space with the goal of creating
# and deleting 1 block group constantly. The intention is to race with the
# fstrim loop below.
#
# Runs forever: fallocate 1G for the file, pause 3 seconds, truncate the file
# to 0, pause 3 seconds, repeat. Output of the xfs_io calls is discarded.
#
# $1 - name of the file to work on, relative to $SCRATCH_MNT
fallocate_loop()
{
	local victim=$SCRATCH_MNT/$1

	# On SIGTERM, wait for a running subcommand before exiting so that the
	# mount point is not busy when we try to unmount it.
	trap "wait; exit" SIGTERM

	while :; do
		$XFS_IO_PROG -f -c "falloc -k 0 1G" $victim >/dev/null 2>&1
		sleep 3
		$XFS_IO_PROG -c "truncate 0" $victim >/dev/null 2>&1
		sleep 3
	done
}

# Run fstrim against the scratch mount point back to back, forever.
trim_loop()
{
	# On SIGTERM, wait for a running subcommand before exiting so that the
	# mount point is not busy when we try to unmount it.
	trap "wait; exit" SIGTERM

	until false; do
		$FSTRIM_PROG $SCRATCH_MNT
	done
}

# Create a bunch of small files that get their single extent inlined in the
# btree, so that we consume a lot of metadata space and get a chance of a
# data block group getting deleted and reused for metadata later. Sometimes
# the creation of all these files succeeds, other times we get ENOSPC failures
# at some point - this depends on how fast btrfs' cleaner kthread is notified
# about empty block groups, how fast it deletes them and how fast the
# fallocate calls happen. So we don't really care if they all succeed or not,
# the goal is just to keep metadata space usage growing while data block
# groups are deleted.
#
# Creating 200,000 files sequentially is really slow, so speed it up a bit
# by doing it concurrently with 4 workers in 4 separate directories.
nr_files=$((50000 * LOAD_FACTOR))

# Create $nr_files files of 3900 bytes in each of the directories
# $SCRATCH_MNT/0 .. $SCRATCH_MNT/3, one background worker per directory, and
# return once all four workers have finished.
#
# A worker stops at the first file it fails to create and records that file
# in $seqres.full; this is not treated as an error.
#
# $1 - prefix of the file names; files are named <prefix>_1 .. <prefix>_N
#
# Globals: reads SCRATCH_MNT, XFS_IO_PROG, nr_files and seqres; stores the
# worker PIDs in create_pids.
create_files()
{
	local prefix=$1
	# Keep the directory counter private so that a caller's own "n" is
	# not clobbered.
	local n

	for ((n = 0; n < 4; n++)); do
		mkdir $SCRATCH_MNT/$n
		(
		# Wait for a running subcommand before exiting so that the
		# mount point is not busy when we try to unmount it.
		trap "wait; exit" SIGTERM

		for ((i = 1; i <= $nr_files; i++)); do
			if ! $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 3900" \
				$SCRATCH_MNT/$n/"${prefix}_$i" &> /dev/null; then
				echo "Failed creating file $n/${prefix}_$i" >>$seqres.full
				break
			fi
		done
		) &
		create_pids[$n]=$!
	done

	# Wait for the workers only, not for other background jobs of the
	# calling shell.
	wait "${create_pids[@]}"
}

# Set up the scratch filesystem; mkfs output goes to the $seqres.full log.
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount
_require_fs_space $SCRATCH_MNT $((10 * 1024 * 1024))
_require_batched_discard $SCRATCH_MNT

# Number of background workers of each kind, scaled by the load factor.
nr_trimmers=$((4 * $LOAD_FACTOR))
nr_fallocators=$((1 * $LOAD_FACTOR))

# Start the fstrim loops, remembering their PIDs.
for ((i = 0; i < nr_trimmers; i++)); do
	trim_loop &
	trim_pids[$i]=$!
done

# Start the fallocate/truncate loops, each working on its own file.
for ((i = 0; i < nr_fallocators; i++)); do
	fallocate_loop "falloc_file_$i" &
	fallocate_pids[$i]=$!
done

# Fill up metadata space while the loops above are running. This returns
# once all the file creators are done.
create_files "foobar"

# Stop the background loops and wait until all of them are gone.
kill ${fallocate_pids[@]}
kill ${trim_pids[@]}
wait

# The fstests framework will now check for fs consistency with fsck.
# The trimming was racy and caused some btree nodes to get full of zeroes on
# disk, which obviously caused fs metadata corruption. The race often led
# to missing free space entries in a block group's free space cache too.

echo "Silence is golden"
status=0
exit
