#!/bin/bash

# The monitor takes no positional arguments; all configuration comes from
# the environment. Bail out with status 1 if any argument is given or if
# any mandatory variable is unset or empty.
[ $# -eq 0 ] || exit 1
for required_var in \
	CLUSTER_NAME DAEMON DAEMON_DIR PSTORAGE_USER \
	PID_DIR PID_FILE LOG_DIR RESTART_TIMEOUT
do
	[ -n "${!required_var}" ] || exit 1
done
unset required_var

# --- Monitor state --------------------------------------------------------
# RUN       1 while the daemon should keep being restarted, 0 once stopped
# LASTERR   last hard-error exit code seen by check_hard_error
# ERRNUM    how many times in a row LASTERR has been seen
# MAXERRNUM streak length at which check_hard_error gives up
RUN=1
LASTERR=0
ERRNUM=0
MAXERRNUM=5

# File whose contents (if present) are appended to the exit message.
EXITMSG_FILE="$DAEMON_DIR/.exit.msg"

# Service name = daemon binary name with a trailing "d" suffix removed.
DAEMON_NAME=$(basename "$DAEMON")
SERVICE=$(basename "$DAEMON_NAME" d)
# The character classes must be quoted: unquoted, [:lower:] is a glob
# pattern that can expand to a single-letter file name in the cwd.
SERVICE_UPPER=$(printf '%s\n' "$SERVICE" | tr '[:lower:]' '[:upper:]')

PSTORAGE_SHM_DIR=/dev/shm/vstorage
# Placeholder until the real service id can be read from DAEMON_DIR.
SERVICEID="init"

# Drop the variables upstart injects so they are not inherited by the
# daemon started later.
unset PREVLEVEL RUNLEVEL \
	UPSTART_EVENTS UPSTART_INSTANCE UPSTART_JOB \
	previous runlevel

# Shared helper library. NOTE(review): names used below but not defined in
# this file (fmt_time, start/stop_locate_repo_disk, CTL_TOOL, CONFIG_DIR)
# presumably come from here -- confirm.
. /usr/libexec/vstorage/service-lib.sh

# Signal handler for stop requests: clears RUN so the monitor loop will not
# restart the daemon, and forwards the default kill signal to the daemon
# when one is currently running (PID non-empty).
shutdown_req() {
	RUN=0
	local target=$PID
	[ -z "$target" ] || kill "$target"
}

# Signal handler for log rotation: relays signal 1 (HUP) to the daemon
# when one is currently running (PID non-empty). Does not touch RUN.
logrotate_req() {
	local target=$PID
	[ -z "$target" ] || kill -1 "$target"
}

# Sleep for roughly $1 seconds in 5-second naps (the total is rounded up
# to a multiple of 5). Does nothing for a timeout of 0 or less.
sleep_timeout() {
	local remaining=$1
	while [ "$remaining" -gt 0 ]; do
		remaining=$(( remaining - 5 ))
		sleep 5
	done
}

# Render an exit status as a parenthesised suffix for status messages.
#   $1 - numeric exit status
# Prints nothing for 0. For the known 128+signo values prints
# " (<code> / <SIGNAME>)"; for any other value prints " (<code>)".
conv_signal() {
	local code=$1
	local signame=""

	# print nothing on success
	if [ "$code" -eq 0 ]; then
		return
	fi

	case $code in
		129) signame="SIGHUP" ;;
		130) signame="SIGINT" ;;
		131) signame="SIGQUIT" ;;
		132) signame="SIGILL" ;;
		134) signame="SIGABRT" ;;
		136) signame="SIGFPE" ;;
		137) signame="SIGKILL" ;;
		139) signame="SIGSEGV" ;;
		141) signame="SIGPIPE" ;;
		142) signame="SIGALRM" ;;
		143) signame="SIGTERM" ;;
	esac

	if [ -n "$signame" ]; then
		printf ' (%s / %s)\n' "$code" "$signame"
	else
		printf ' (%s)\n' "$code"
	fi
}

# Succeeds (0) only when $DAEMON_DIR/logs is a symbolic link that resolves
# to a directory; returns 1 otherwise.
is_log_valid()
{
	local link="$DAEMON_DIR/logs"

	if [ -L "$link" ] && [ -d "$link" ]; then
		return 0
	fi
	return 1
}

# Make sure $DAEMON_DIR/logs is usable and publish the log file paths.
#
# If $DAEMON_DIR/logs is not a valid symlink-to-directory (is_log_valid),
# a fresh directory is created under $LOG_DIR and linked there. The
# resolved directory is then made group-accessible for $PSTORAGE_USER.
#
# Globals read:    DAEMON_DIR LOG_DIR SERVICE PSTORAGE_USER LOG_COMPRESS
# Globals written: LOG_FILE FATAL_FILE
# Returns: 0 on success, 1 if the directory cannot be created, linked or
#          have its group/permissions adjusted.
get_log_files()
{
	local dir logdir

	if ! is_log_valid; then
		# Capture mktemp's output directly: piping it into
		# "if read ...; else return 1" ran the return in a subshell,
		# so a failed mktemp was never propagated to the caller.
		logdir=$(mktemp -d "$LOG_DIR/$SERVICE-XXXXXXXX") || return 1
		ln -sf "$logdir" "$DAEMON_DIR/logs" || return 1
	fi

	# Work on the link target for ownership and permissions.
	dir=$(readlink -f "$DAEMON_DIR/logs")
	# Remove a ".repo" symlink inside the log directory, if present.
	[ -L "$dir/.repo" ] && rm -f "$dir/.repo"
	chgrp "$(id -g "$PSTORAGE_USER")" "$dir" &&
	chmod g+rwx "$dir" || return 1

	# Published paths go through the symlink, not the resolved target.
	dir="$DAEMON_DIR/logs"
	LOG_FILE="$dir/$SERVICE.log$LOG_COMPRESS"
	FATAL_FILE="$dir/fatal.log"
}

# Remove this service's repository through the control tool and stop the
# monitor: sets RUN=0 and prepares an informational "was deleted" message
# (MSG / SEV), runs "$CTL_TOOL ... rm-<service> --local <dir>", then calls
# start_locate_repo_disk for the repository directory.
delete_repo()
{
	local -a rm_args=(
		-q -c "$CLUSTER_NAME" --timeout=30
		"rm-$SERVICE" --local "$DAEMON_DIR"
	)

	RUN=0
	MSG="was deleted"
	SEV="-i"

	$CTL_TOOL "${rm_args[@]}"

	start_locate_repo_disk "$DAEMON_DIR"
}

# Verify that a repository belongs to this host.
#   $1 - file holding the repository's host id
# Compares the contents of $1 with $CONFIG_DIR/host_id. Exits the whole
# script with status 1 (message on stderr) when either id cannot be read
# or the two differ; returns 0 when they match.
# Any further arguments are ignored.
# repo_host_id and host_id are deliberately left global, as before.
check_host_id()
{
	repo_host_id=$(cat "$1")
	if [ -z "$repo_host_id" ]; then
		echo "Can't read host_id from $1" 1>&2
		exit 1
	fi

	# Quoted so a CONFIG_DIR containing whitespace still resolves to a
	# single path.
	host_id=$(cat "$CONFIG_DIR/host_id")
	if [ -z "$host_id" ]; then
		echo "Unable read host_id" 1>&2
		exit 1
	fi

	if [ "$host_id" != "$repo_host_id" ] ; then
		echo "Wrong host ID '$repo_host_id'" 1>&2
		exit 1
	fi

	return 0
}

# Track repeated "hard" exit codes and stop the monitor on a long streak.
#
# Hard errors are exit codes in [100,109] plus 134 (SIGABRT). For any
# other EXIT_CODE nothing is modified. For a hard error MSG is set; a code
# different from LASTERR starts a new streak (ERRNUM=1), the same code
# extends it. Once ERRNUM reaches MAXERRNUM, RUN is cleared and MSG says
# the service is being stopped.
check_hard_error()
{
	{ [ "$EXIT_CODE" -ge 100 ] && [ "$EXIT_CODE" -lt 110 ]; } ||
		[ "$EXIT_CODE" -eq 134 ] || return

	MSG="reports hard error"

	# New kind of hard error: restart the count.
	if [ "$EXIT_CODE" -ne "$LASTERR" ]; then
		LASTERR="$EXIT_CODE"
		ERRNUM=1
		return
	fi

	# Same error again: extend the streak and give up once it is too long.
	ERRNUM=$(( ERRNUM + 1 ))
	[ "$ERRNUM" -ge "$MAXERRNUM" ] || return

	RUN=0
	MSG="reports hard error $MAXERRNUM times in a row, stop it"
}

# Create directory $1 (if it does not exist yet) and make it group
# read/write/searchable for the primary group of $PSTORAGE_USER.
#   $1 - directory path
# Returns 0 if the directory already exists or was set up, 1 on failure.
# All expansions are quoted so paths containing whitespace work and an
# empty argument is rejected instead of being reported as existing.
make_dir() {
	local dir=$1
	[ -d "$dir" ] && return 0

	mkdir "$dir" && chgrp "$(id -g "$PSTORAGE_USER")" "$dir" &&
		chmod g+rwx "$dir" || return 1

	return 0
}

# Build the default status message (MSG) and severity (SEV) for the last
# daemon exit.
#
# While the monitor is still running (RUN=1):
#   EXIT_CODE 0   -> MSG="restarted", SEV="-i"
#   anything else -> MSG="died unexpectedly" (SEV is left as the caller set it)
# When the monitor is stopping: MSG="was stopped", SEV="-w", followed by
# the contents of $PID_DIR/stop-reason when that file can be read, or
# otherwise by a halt/reboot note derived from the current runlevel.
#
# Globals read:    RUN EXIT_CODE PID_DIR
# Globals written: MSG SEV
prepare_msg()
{
	local reason_file reason rl

	if [ "$RUN" -eq 1 ]; then
		case "$EXIT_CODE" in
			0) # ok
				MSG="restarted"
				SEV="-i"
				;;
			*) # error
				MSG="died unexpectedly"
				;;
		esac
		return 0
	fi

	MSG="was stopped"
	SEV="-w"

	# An explicit stop reason wins; the runlevel is only consulted (and
	# the runlevel/awk pipeline only spawned) when no reason can be read.
	reason_file="$PID_DIR/stop-reason"
	if [ -f "$reason_file" ] && reason=$(cat "$reason_file"); then
		MSG="$MSG $reason"
		return 0
	fi

	rl=$(runlevel | awk '{print $2}')
	case "$rl" in
		0) # halt
			MSG="$MSG during host halt"
			;;
		6) # reboot
			MSG="$MSG during host reboot"
			;;
	esac
	return 0
}

# Turn MSG into the final event text:
#   "<SERVICE_UPPER>#<SERVICEID> <MSG><exit-status suffix>[: <exit message>]"
# The suffix comes from conv_signal; the optional tail is the content of
# $EXITMSG_FILE when that file exists.
finish_msg()
{
	local status_suffix
	status_suffix=$(conv_signal "$EXIT_CODE")

	MSG="$SERVICE_UPPER#$SERVICEID $MSG$status_suffix"

	[ -f "$EXITMSG_FILE" ] || return 0
	MSG="$MSG: $(<"$EXITMSG_FILE")"
}

# Publish the outcome of the last daemon run to the cluster event log.
# A default message is prepared only when MSG is still empty; the message
# is then finalised and handed to "$CTL_TOOL ... put-event" in the
# background, so the monitor does not wait for the tool to finish.
report_status()
{
	if [ -z "$MSG" ]; then
		prepare_msg
	fi
	finish_msg

	# send exit message to event log
	$CTL_TOOL -q -c "$CLUSTER_NAME" --timeout=30 \
		put-event "$SEV" -s monitor "$MSG" &
}

# Main supervision loop: start the daemon, wait for it to exit, report the
# outcome and restart it until RUN is cleared.
#
# Signals 2, 3 and 15 are routed to shutdown_req and HUP to logrotate_req.
# After every daemon exit the status is classified (exit code 222 triggers
# delete_repo, check_hard_error tracks hard-error streaks) and reported
# via report_status. A non-zero exit is additionally logged to FATAL_FILE
# and followed by a back-off of RESTART_TIMEOUT seconds before the
# restart. The loop also ends when DAEMON_DIR disappears. PID_DIR is
# removed on the way out.
#
# NOTE(review): fmt_time and stop_locate_repo_disk are not defined in this
# file; presumably provided by the sourced service library -- confirm.
monitor_func() {
	trap shutdown_req 2 3 15
	trap logrotate_req HUP

	make_dir "$PSTORAGE_SHM_DIR"
	make_dir "$PSTORAGE_SHM_DIR/$CLUSTER_NAME"

	while true; do
		rm -f "$EXITMSG_FILE"
		AUTH_OPT=""
		[ -n "$SECURE_AUTH" ] && AUTH_OPT="-a $SECURE_AUTH"

		LOG_ROTATION_OPT=""
		[ -n "$LOG_ROTATION" ] && LOG_ROTATION_OPT="-L $LOG_ROTATION"

		VERBOSITY_OPT=""
		[ -n "$VERBOSITY_LEVEL" ] && VERBOSITY_OPT="-d $VERBOSITY_LEVEL"

		if [ -n "$LIMIT_NUM_FILES" ] ; then
			ulimit -n "$LIMIT_NUM_FILES" >/dev/null 2>&1
		fi

		# The *_OPT variables are left unquoted on purpose: each holds
		# "<flag> <value>" and must split into two words.
		"$DAEMON" -r "$DAEMON_DIR" -l "$LOG_FILE" -u "$PSTORAGE_USER" $AUTH_OPT $LOG_ROTATION_OPT $VERBOSITY_OPT >> "$FATAL_FILE" 2>&1 &
		PID=$!
		stop_locate_repo_disk "$DAEMON_DIR"

		# Forget the previous run's status so that a 127 from the very
		# first wait below can be recognised as the daemon's own status.
		EXIT_CODE=""
		while true; do
			wait "$PID"
			# We need to restart wait if it was interrupted by a signal.
			# The problem is that we can't distinguish situations, (1) when
			# wait is interrupted by a signal, and (2) when the background
			# service was terminated by a signal. In both situations wait
			# returns 128 + signo. The solution is to restart wait if its
			# exit code is in the (128, 160) interval. In situation 2 the
			# restarted wait returns 127, which means the background
			# service is already gone and the status recorded before stands.
			code=$?
			# A 127 only means "already reaped" after an earlier wait has
			# recorded a status. On the first wait it is the daemon's own
			# exit status (e.g. the binary could not be executed) and must
			# be kept, otherwise the failure is never reported as such and
			# the restart back-off below is skipped.
			if [ "$code" -eq 127 ] && [ -n "$EXIT_CODE" ]; then
				break
			fi
			EXIT_CODE="$code"
			[ "$EXIT_CODE" -gt 128 -a "$EXIT_CODE" -lt 160 ] || break;
		done
		unset PID MSG
		SEV="-e"
		# Pick up the real service id once the daemon has created it.
		if [ "$SERVICEID" = "init" ]; then
			[ -f "$DAEMON_DIR/control/id" ] && SERVICEID=$(cat "$DAEMON_DIR/control/id")
			[ -f "$DAEMON_DIR/id" ] && SERVICEID=$(cat "$DAEMON_DIR/id")
			export SERVICEID
		fi
		# Exit code 222 leads to removal of the repository.
		[ "$EXIT_CODE" -eq 222 ] && delete_repo;
		check_hard_error
		report_status

		[  ! -d "$DAEMON_DIR" ] && RUN=0

		[ "$RUN" -eq 1 ] || break;
		if [ "$EXIT_CODE" -ne 0 ]; then
			local datetime=$(fmt_time)
			echo "$datetime $MSG" >> "$FATAL_FILE"
			sleep_timeout "$RESTART_TIMEOUT"
			# A stop request handled during the back-off finds no PID to
			# kill; honour it here instead of starting the daemon again.
			[ "$RUN" -eq 1 ] || break
		fi
	done

	rm -rf "$PID_DIR"
}

# Startup sequence. The repository directory must exist and be non-empty.
if ! [ -d "$DAEMON_DIR" -a "$(ls -A "$DAEMON_DIR")" ]; then
	exit 1
fi

# A repository marked for deletion is removed instead of being started.
if [ -e "$DAEMON_DIR/.deleted" ]; then
	delete_repo
	exit 1
fi

# When the repository records a host id, it has to match this host
# (check_host_id exits the script on mismatch).
if [ -f "$DAEMON_DIR/control/host_id" ]; then
	check_host_id "$DAEMON_DIR/control/host_id" "$DAEMON_DIR"
fi

get_log_files || exit 1

# Service id: "$DAEMON_DIR/id" takes precedence over "control/id" when
# both exist, because it is read last.
if [ -f "$DAEMON_DIR/control/id" ]; then
	SERVICEID=$(cat "$DAEMON_DIR/control/id")
fi
if [ -f "$DAEMON_DIR/id" ]; then
	SERVICEID=$(cat "$DAEMON_DIR/id")
fi
export SERVICEID

# Detach from the caller's stdio, start the monitor loop in the
# background, record its pid and keep it from receiving SIGHUP when this
# shell exits.
exec </dev/null >/dev/null 2>&1
monitor_func &
echo $! > "$PID_FILE"
disown -h
exit 0
