#!/bin/sh
# PCP QA Test No. 870
# Various checks around the pmlogger control and run files in
# $PCP_TMP_DIR/pmlogger and $PCP_RUN_DIR
#
# With a single --check option, is silent except if there is a problem
# and runs just the integrity check.  This could be used from check or
# check.callback
#
# Copyright (c) 2016 Ken McDonell.  All Rights Reserved.
#

seq=`basename $0`

# Two ways of being run:
#   870			the full QA test, chatty
#   870 --check [seq]	integrity check only, silent unless there is a
#			problem; the optional (non-empty) 2nd argument
#			replaces $seq, which is what check.callback is
#			expected to pass
#
check=false
case "$1"
in
    --check)
	check=true
	if [ -n "$2" ]
	then
	    seq="$2"
	fi
	;;
esac

if $check
then
    :
else
    echo "QA output created by $seq"
fi

# get standard environment, filters and checks
. ./common.product
. ./common.filter
. ./common.check

# success is the default!
status=0

# start with no temporary files; the .full file is only reset for a real
# QA run because in --check mode it is appended to across invocations
#
$sudo rm -rf $tmp $tmp.*
if $check
then
    :
else
    $sudo rm -f $seq.full
fi

# $here, $sudo and $tmp are expanded now, $status when the trap fires
trap "cd $here; $sudo rm -rf $tmp $tmp.*; exit \$status" 0 1 2 3 15

# _870_ps_rows pid
#
# Print the line(s) of the saved ps snapshot ($tmp.ps.out) whose second
# field equals pid.  The pid is handed to awk with -v rather than being
# pasted into the awk program, and anything that is not a string of
# digits (empty, a stray file name, ...) matches nothing, so odd names
# found in the control or run directories cannot produce awk syntax
# errors or accidental matches.
#
_870_ps_rows()
{
    case "$1"
    in
	''|*[!0-9]*)
		# not a pid, so no process can match
		return 0
		;;
    esac
    $PCP_AWK_PROG -v pid="$1" '$2 == pid { print }' <$tmp.ps.out
}

# _870_pid_alive pid
#
# Succeed if pid is in the ps snapshot.  If it is not there, take a new
# snapshot (the process may have been started since the last one) and
# look once more.  Leaves the matching ps line(s) in $tmp.ps.pid and may
# rewrite $tmp.ps.out.
#
_870_pid_alive()
{
    _870_ps_rows "$1" >$tmp.ps.pid
    if [ -s $tmp.ps.pid ]
    then
	return 0
    fi
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS >$tmp.ps.out
    _870_ps_rows "$1" >$tmp.ps.pid
    [ -s $tmp.ps.pid ]
}

# _check
#
# Integrity check of the pmlogger control files in $PCP_TMP_DIR/pmlogger
# and the run files in $PCP_RUN_DIR against the pmlogger processes that
# are running right now.
#
# Problems are reported on stdout and a trace is appended to
# $here/$seq.full.  When $check is true (--check mode) each Error, and
# the "no pmloggers running" warning, also creates $tmp.err, and control
# or run files with no matching process are removed.  When $check is
# false an "... is OK" line is echoed for each primary pmlogger link that
# checks out.
#
# Overwrites the scratch files $tmp.pmids, $tmp.ps.out and $tmp.ps.pid
# and the shell variables pid, check_pid, file, marker and num_primary.
#
_check()
{
    # start by iterating over all running pmlogger processes
    #

    if $check
    then
	echo "--- running 870 --check @ `date` ---" >>$here/$seq.full
    fi
    _get_pids_by_name pmlogger >$tmp.pmids
    echo "_get_pids_by_name pmlogger ..." >>$here/$seq.full
    cat $tmp.pmids >>$here/$seq.full
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS >$tmp.ps.out
    echo "$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep pmlogger ..." >>$here/$seq.full
    grep pmlogger <$tmp.ps.out >>$here/$seq.full
    if [ -s $tmp.pmids ]
    then
	# at least one is running
	for pid in `cat $tmp.pmids`
	do
	    # ps line for this pmlogger, used to spot the primary (-P)
	    # one and to decorate any error report
	    #
	    _870_ps_rows "$pid" >$tmp.ps.pid

	    # control file(s) ...
	    #
	    if [ -f $PCP_TMP_DIR/pmlogger/$pid ]
	    then
		# control file exists
		if grep 'pmlogger .*-P ' <$tmp.ps.pid >/dev/null
		then
		    # primary logger
		    if [ -L $PCP_TMP_DIR/pmlogger/primary ]
		    then
			# last component of the symlink target is the pid
			check_pid=`ls -l $PCP_TMP_DIR/pmlogger/primary | sed -e 's/.*\///'`
			if [ "$check_pid" = "$pid" ]
			then
			    # all is well
			    if $check
			    then
				:
			    else
				echo "control file check is OK"
			    fi
			else
			    echo "Error: $PCP_TMP_DIR/pmlogger/primary linked to pid $check_pid instead of $pid"
			    head -1 $tmp.ps.out
			    _870_ps_rows "$check_pid"
			    cat $tmp.ps.pid
			    ls -l $PCP_TMP_DIR/pmlogger/*
			    $check && touch $tmp.err
			fi
		    else
			echo "Error: $PCP_TMP_DIR/pmlogger/primary missing for ..."
			head -1 $tmp.ps.out
			cat $tmp.ps.pid
			ls -l $PCP_TMP_DIR/pmlogger/*
			$check && touch $tmp.err
		    fi
		else
		    # nothing more to check
		    :
		fi
	    else
		echo "Error: $PCP_TMP_DIR/pmlogger/$pid missing for ..."
		head -1 $tmp.ps.out
		cat $tmp.ps.pid
		ls -l $PCP_TMP_DIR/pmlogger/*
		$check && touch $tmp.err
	    fi

	    # run file(s) ...
	    #
	    if [ -S $PCP_RUN_DIR/pmlogger.$pid.socket ]
	    then
		# run file exists
		if grep 'pmlogger .*-P ' <$tmp.ps.pid >/dev/null
		then
		    # primary logger
		    if [ -L $PCP_RUN_DIR/pmlogger.primary.socket ]
		    then
			# symlink target is .../pmlogger.<pid>.socket
			check_pid=`ls -l $PCP_RUN_DIR/pmlogger.primary.socket | sed -e 's/.*\///' -e 's/pmlogger\.//' -e 's/\.socket//'`
			if [ "$check_pid" = "$pid" ]
			then
			    # all is well
			    if $check
			    then
				:
			    else
				echo "run file check is OK"
			    fi
			else
			    echo "Error: $PCP_RUN_DIR/pmlogger.primary.socket linked to pid $check_pid instead of $pid"
			    head -1 $tmp.ps.out
			    _870_ps_rows "$check_pid"
			    cat $tmp.ps.pid
			    ls -l $PCP_RUN_DIR/pmlogger.*
			    $check && touch $tmp.err
			fi
		    else
			echo "Error: $PCP_RUN_DIR/pmlogger.primary.socket missing for ..."
			head -1 $tmp.ps.out
			cat $tmp.ps.pid
			ls -l $PCP_RUN_DIR/pmlogger.*
			$check && touch $tmp.err
		    fi
		else
		    # nothing more to check
		    :
		fi
	    else
		echo "Error: $PCP_RUN_DIR/pmlogger.$pid.socket missing for ..."
		head -1 $tmp.ps.out
		cat $tmp.ps.pid
		ls -l $PCP_RUN_DIR/pmlogger.*
		$check && touch $tmp.err
	    fi

	done
    else
	echo "Warning: no pmloggers running"
	$check && touch $tmp.err
    fi

    # Check for dead control files with no pmlogger attached
    #
    if [ -d $PCP_TMP_DIR/pmlogger ]
    then
	for file in `cd $PCP_TMP_DIR/pmlogger; ls`
	do
	    case "$file"
	    in
		primary)
			    pid=`ls -l $PCP_TMP_DIR/pmlogger/primary | sed -e 's/.*\///'`
			    ;;
		*)
			    pid="$file"
			    ;;
	    esac
	    if _870_pid_alive "$pid"
	    then
		# OK
		:
	    else
		echo "Warning: no matching pmlogger process for ..." >>$here/$seq.full
		ls -l $PCP_TMP_DIR/pmlogger/$file >>$here/$seq.full
		if $check
		then
		    $sudo rm -f $PCP_TMP_DIR/pmlogger/$file
		    echo "File removed." >>$here/$seq.full
		fi
	    fi
	done
    else
	echo "Error: directory $PCP_TMP_DIR/pmlogger missing!"
	$check && touch $tmp.err
    fi

    # Check for dead run files with no pmlogger attached
    #
    if [ -d $PCP_RUN_DIR ]
    then
	for file in `cd $PCP_RUN_DIR; ls`
	do
	    # only pmlogger.* files are of interest here, anything else
	    # in $PCP_RUN_DIR leaves pid empty and is skipped
	    #
	    pid=''
	    case "$file"
	    in
		pmlogger.primary.socket)
			    pid=`ls -l $PCP_RUN_DIR/pmlogger.primary.socket | sed -e 's/.*\///' -e 's/pmlogger\.//' -e 's/\.socket//'`
			    ;;
		pmlogger.pid)
			    pid=`cat $PCP_RUN_DIR/$file`
			    ;;
		pmlogger.*)
			    pid=`echo $file | sed -e 's/pmlogger\.//' -e 's/\.socket//'`
			    ;;
	    esac
	    [ -z "$pid" ] && continue
	    if _870_pid_alive "$pid"
	    then
		# OK
		:
	    else
		echo "Warning: no matching pmlogger process for ..." >>$here/$seq.full
		ls -l $PCP_RUN_DIR/$file >>$here/$seq.full
		if $check
		then
		    $sudo rm -f $PCP_RUN_DIR/$file
		    case "$file"
		    in
			*.socket)	echo "Socket removed." >>$here/$seq.full
					;;
			*)		echo "File removed." >>$here/$seq.full
					;;
		    esac
		fi
	    fi
	done
    else
	echo "Error: directory $PCP_RUN_DIR missing!"
	$check && touch $tmp.err
    fi

    # Did a QA test leave behind a modified pmlogger control file?
    #
    marker='PCP QA test'
    if grep "$marker" $PCP_PMLOGGERCONTROL_PATH >/dev/null
    then
	echo "Error: $PCP_PMLOGGERCONTROL_PATH changed for QA, may not be correct"
	grep "$marker" $PCP_PMLOGGERCONTROL_PATH
	$check && touch $tmp.err
    fi

    # And finally, at most 1 primary pmlogger
    #
    num_primary=`grep -c 'pmlogger .*-P ' <$tmp.ps.out`
    if [ "$num_primary" -gt 1 ]
    then
	echo "Error: more than one primary logger running ..."
	head -1 $tmp.ps.out
	grep 'pmlogger .*-P ' <$tmp.ps.out
	$check && touch $tmp.err
    fi
}


# real QA test starts here
_check

if $check
then
    # --check mode stops here: stay silent when _check left no $tmp.err
    # behind, otherwise timestamp the report and fail.  The exit trap
    # turns $status into the exit status.
    #
    if [ ! -f $tmp.err ]
    then
	exit
    fi
    echo "Now: `date`"
    status=1
    exit
fi

# _870_pid_in_ps pid
#
# Succeed if pid is in the 2nd field of a line of fresh ps output.  The
# answer is the exit status of awk: found is only set on a match and the
# END rule (which still runs after the exit in the main rule) turns it
# into the status.  A pid that is not all digits is never present.
#
_870_pid_in_ps()
{
    case "$1"
    in
	''|*[!0-9]*)
		return 1
		;;
    esac
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS \
    | $PCP_AWK_PROG -v pid="$1" '
$2 == pid	{ found = 1; exit }
END		{ exit !found }'
}

# _870_pid_lingers pid
#
# Look for pid up to 5 times, sleeping 1 second after each sighting.
# Fail as soon as pid is gone, succeed if it was seen every time.
#
_870_pid_lingers()
{
    for i in 1 2 3 4 5
    do
	_870_pid_in_ps "$1" || return 1
	sleep 1
    done
    return 0
}

# _870_show_pid pid
#
# Diagnostic: ps heading plus every ps line mentioning pid, less the
# grep -E from this pipeline which would otherwise match its own
# argument.
#
_870_show_pid()
{
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS \
    | grep -E "PID|$1" \
    | grep -v 'grep -E'
}

echo
echo "start another primary pmlogger (expect failure) ..." | tee -a $here/$seq.full
# $pid is expected to be set by _start_up_pmlogger
_start_up_pmlogger -Dcontext,pmlc -L -c /dev/null -l $tmp.log -P $tmp.arch
echo "pmlogger pid $pid" >>$here/$seq.full

# the second primary pmlogger should exit by itself, give it some time
# to do that before getting heavy handed
#
if _870_pid_lingers "$pid"
then
    echo "Arrggh ... pmlogger appears to be running ..."
    _870_show_pid "$pid"
    echo "kill off the primary pmlogger we just started ..."
    $sudo kill -KILL $pid
    if _870_pid_lingers "$pid"
    then
	echo "Arrggh ... failed to kill of pmlogger (pid $pid)"
	_870_show_pid "$pid"
    fi
fi

# the interesting pmlogger diagnostics, made deterministic
grep -E 'ERROR:|info:' <$tmp.log \
| sed \
    -e 's/pid [0-9][0-9]*/pid SOMEPID/g' \
    -e "s@$PCP_TMP_DIR/@PCP_TMP_DIR/@g"

cat $tmp.log >>$here/$seq.full

# the failed start must not have damaged the control and run files
_check

exit
