Author | Butch Deal |
Compatibility | Xymon 4.2 |
Requirements | SGE |
Download | None |
Last Update | 2009-06-02 |
This script will alert if any local queues are in error, and will return some basic information such as listing the queues and jobs on the local host. It will also give the host definition and list any errors found. Version 0.2, now marks disabled queues as clear instead of error. Version 0.3, added unknown queue status, and cut down on the number of qhost runs.
Jobs
job-ID prior name user state submit/start at queue slots ja-task-ID
-----------------------------------------------------------------------------------------------------------------
4839010 0.60000 ovl_p.huma bwalenz r 06/02/2009 13:18:39 medium.q@dell-1-0-6.tigr.org 2 10
4839988 0.59902 ovl_d.anan bwalenz r 06/02/2009 15:32:37 medium.q@dell-1-0-6.tigr.org 2 17
4839988 0.59902 ovl_d.anan bwalenz r 06/02/2009 15:37:25 medium.q@dell-1-0-6.tigr.org 2 21
4835225 0.50000 blastp djohnson r 06/02/2009 09:56:29 default.q@dell-1-0-6.tigr.org 1
4835351 0.50000 blastp djohnson r 06/02/2009 10:11:21 default.q@dell-1-0-6.tigr.org 1
4835471 0.50000 blastp djohnson r 06/02/2009 11:11:05 default.q@dell-1-0-6.tigr.org 1
4835507 0.50000 blastp djohnson r 06/02/2009 11:16:08 default.q@dell-1-0-6.tigr.org 1
Host
HOSTNAME ARCH NCPU LOAD MEMTOT MEMUSE SWAPTO SWAPUS
-------------------------------------------------------------------------------
dell-1-0-6 lx26-eon64 8 11.60 15.7G 6.5G 16.0G 51.2M
Queue Instance Status Report
yellow default.q@dell-1-0-6 is in ALARM
green fast.q@dell-1-0-6 is OK
green medium.q@dell-1-0-6 is OK
green troubleshooting.q@dell-1-0-6 is OK
Client side
Edit sge.sh and set SGEBIN path.
Add this to clientlaunch.cfg :
[sge]
ENVFILE $HOBBITCLIENTHOME/etc/hobbitclient.cfg
CMD $HOBBITCLIENTHOME/ext/sge.sh
LOGFILE $HOBBITCLIENTHOME/logs/sge.log
INTERVAL 5m
Server side
#!/bin/sh
#
# SGE: Sun Grid Engine check - BB external script test
#
##### Purpose is to report back to a central server, all Sun
##### Grid Engine software faults.
#####
#
# version 0.3
#
# BIG BROTHER / XXXXXXXXXXXXXXXX status
#
# Written by Butch Deal <butchdeal@yahoo.com>
# Daniel Gomez <dgomez@tigr.org,daniel@ixplosive.com>
#
# v0.3e 10/14/08 cut down on the number of qhost runs
# v0.3d 03/31/06 added alarm/suspend state identification
# v0.3c 03/01/06 propagated yellow state upon UNAVAILABLE queue instances
# v0.3b 01/31/06 fixed yellow warning queue status for ambiguous config test
# v0.3a 01/31/06 added unknown queue status and ambiguous config test
# v0.3 01/26/06 fixed status reporting and optimized job status
# v0.2 08/03/05 flag disabled queues as clear
# v0.1 07/28/05 authored
########################################
# NOTE
# This has been tested with BB 1.9e and Xymon 4.2.x
#
# The mapping from queue state to status color is arbitrary and should be
# reviewed for your particular environment.
#
# Tested on :
# Solaris & Linux
########################################
########################################
# INSTALLATION
# step 1 - update bb-bbexttab to include this script
#
# step 4 - restart Big Brother
#
# NOTE - the TEST variable in the configuration section is the name used
# as the column header.
########################################
##################################
# CONFIGURE IT HERE
##################################
# Name of this test; it is also the column header on the display.
TEST="sge"

# Path this script was started as.
BBPROG="$0"

# Directory holding the SGE client commands; the individual tool paths
# below are derived from it.
SGEBIN=/usr/local/bin
QSTAT="$SGEBIN/qstat"
QHOST="$SGEBIN/qhost"
QSELECT="$SGEBIN/qselect"
export BBPROG SGEBIN QSTAT QHOST QSELECT

# define colours for graphics
# Comment these out if using older BB versions
GREEN_PIC="&green"
YELLOW_PIC="&yellow"
RED_PIC="&red"
CLEAR_PIC="&clear"
UNKNOWN_PIC="&purple"
##################################
# Start of script
##################################
# Locate the BB installation.
# A BBHOME already provided by the launching environment (for example via
# the ENVFILE of the client launcher) is honoured; /var/BB/bb is only the
# fallback for stand-alone runs.  Because of the fallback BBHOME can never
# be empty here, so only the "is it a directory" check is needed.
BBHOME="${BBHOME:-/var/BB/bb}"; export BBHOME
if test ! -d "$BBHOME"
then
    echo "template: BBHOME is invalid"
    exit 1
fi

# BBTMP is used as the marker for "standard definitions already loaded".
if test ! "$BBTMP" # GET DEFINITIONS IF NEEDED
then
    # echo "*** LOADING BBDEF ***"
    . "$BBHOME/etc/bbdef.sh" # INCLUDE STANDARD DEFINITIONS
fi
# get_header TITLE [COMMAND]
#
# Print a blank line followed by TITLE as a large (+2) bold HTML heading.
# COMMAND ($2) is accepted for the alternative layouts kept below, but the
# active layout does not print it.
get_header()
{
    printf '\n%s\n' "<FONT SIZE=+2><b>$1</b></FONT> <BR>"

    # Alternative layouts - enable one of them instead of the printf above.
    #
    # Large heading that also shows the command:
    #echo "<FONT SIZE=+2><b>$1</b></FONT> ($2)<BR>"
    #
    # Heading in the normal font size:
    #echo "<b>$1</b> ($2)"
    #
    # The "Paul Luzzi" look:
    #echo "<P><DIV ALIGN=\"CENTER\"><HR>"
    #echo "<B>============== $1 ==============</B>"
    #echo "<B>--- ($2) ---</B>"
    #echo "<HR></DIV>"
    #echo "<BLOCKQUOTE>"
}
# get_header_small TITLE [COMMAND]
#
# Print a blank line followed by TITLE as a medium (+1) bold HTML heading.
# COMMAND ($2) is accepted for the alternative layouts kept below, but the
# active layout does not print it.
get_header_small()
{
    printf '\n%s\n' "<FONT SIZE=+1><b>$1</b></FONT> <BR>"

    # Alternative layouts - enable one of them instead of the printf above.
    #
    # Large heading that also shows the command:
    #echo "<FONT SIZE=+2><b>$1</b></FONT> ($2)<BR>"
    #
    # Heading in the normal font size:
    # echo "<b>$1</b> ($2)"
    #
    # The "Paul Luzzi" look:
    #echo "<P><DIV ALIGN=\"CENTER\"><HR>"
    #echo "<B>============== $1 ==============</B>"
    #echo "<B>--- ($2) ---</B>"
    #echo "<HR></DIV>"
    #echo "<BLOCKQUOTE>"
}
# get_footer
#
# Close a report section; the active layout just prints one blank line.
get_footer()
{
    printf '\n'

    # For the "Paul Luzzi" look, also enable:
    #echo "</BLOCKQUOTE>"
}
#####
##### Get Status proc - used to get all responses
#####
# Writes the report for this host to stdout (running jobs, host definition,
# one status line per queue instance) and leaves the overall status colour
# in the exported global COLOR.
#
# Reads:  SGEBIN QSTAT QHOST GREP AWK RM BBTMP MACHINE MACHINEDOTS HOST TEST
#         and the *_PIC colour markers.
# Writes: COLOR (exported), queueMsg, queueTriggered, Queueset, Qset,
#         qstate, qstate_file, jobs.
# Calls:  get_header, get_footer.
#
# Call it in the current shell (a plain redirect is fine); inside a command
# substitution or pipeline the COLOR assignment would be lost.
#
# The command variables (QSTAT, QHOST, GREP, AWK, RM) are expanded unquoted
# on purpose so that a definition carrying extra arguments keeps working.

# Raise the status colour held in COLOR to $1, but never lower it.
# Severity order: green < clear < yellow < red.
sge_raise_color()
{
    case "$COLOR:$1" in
        green:*|clear:yellow|clear:red|yellow:red)
            COLOR=$1
            ;;
    esac
}

get_status()
{
    #####
    ##### Setup some variables for use later
    #####
    COLOR="green"
    queueMsg=""            # do not inherit stale text from the environment
    queueTriggered=false

    # Check defaults have been set
    if [ "$SGEBIN" = "" ]; then
        SGEBIN=/usr/local/bin
        echo ""
        echo "$YELLOW_PIC SGEBIN command is not defined in etc/bbsys.local - using default: $SGEBIN"
    fi
    if [ "$QSTAT" = "" ]; then
        QSTAT=${SGEBIN}/qstat
        echo ""
        echo "$YELLOW_PIC QSTAT command is not defined in etc/bbsys.local - using default: $QSTAT"
    fi
    if [ "$QHOST" = "" ]; then
        QHOST=${SGEBIN}/qhost
        echo ""
        echo "$YELLOW_PIC QHOST command is not defined in etc/bbsys.local - using default: $QHOST"
    fi

    ###
    ### Check the jobs
    ###
    get_header "Jobs" "$QSTAT -l hostname=$MACHINEDOTS"
    # Run qstat once and print the captured text, so the emptiness test and
    # the displayed list always come from the same snapshot.
    jobs=`${QSTAT} -l "hostname=${MACHINEDOTS}" -s r`
    if [ -z "$jobs" ]; then
        echo "No Running Jobs"
    else
        printf '%s\n' "$jobs"
    fi
    get_footer

    ###
    ### Check the host
    ###
    get_header "Host" "$QHOST -h $MACHINEDOTS"
    ${QHOST} -h "${MACHINEDOTS}" | ${GREP} -v "global"
    get_footer

    ###
    ### Identify queue memberships
    ###
    #get_header "Queue Membership" "$QHOST -q"
    #${QHOST} -h "${MACHINEDOTS}" -q | ${AWK} 'NR >= 5'
    #get_footer

    ###
    ### Check queue instance states
    ###
    # One qhost run is cached in a scratch file.  Its first four lines are
    # skipped so that only the per-queue lines remain.  awk does the
    # skipping because the old "tail +5" spelling is not accepted by every
    # tail implementation.
    qstate_file="$BBTMP/$MACHINE.$TEST.QSTATE"
    ${QHOST} -h "${MACHINEDOTS}" -q | ${AWK} 'NR >= 5' > "$qstate_file"
    Queueset=`${AWK} '{ print $1 }' "$qstate_file"`

    for Qset in $Queueset; do
        # Exact match on the queue-name column: a substring/regex match would
        # let "all.q" also pick up the state of "all.q.long".
        qstate=`${AWK} '$1 == q { print $4 }' q="$Qset" "$qstate_file"`

        # Order determines which state is reported when several flags are set.
        case "$qstate" in
            *d*)
                sge_raise_color clear
                queueMsg="$queueMsg<BR>$CLEAR_PIC $Qset@$HOST is DISABLED"
                queueTriggered=true
                ;;
            *E*)
                sge_raise_color red
                queueMsg="$queueMsg<BR>$RED_PIC $Qset@$HOST is in ERROR!"
                queueTriggered=true
                ;;
            *c*)
                sge_raise_color yellow
                queueMsg="$queueMsg<BR>$YELLOW_PIC $Qset@$HOST has an ambigious configuration!"
                queueTriggered=true
                ;;
            *[aA]*)
                # Shown with a yellow marker but does not change the page colour.
                queueMsg="$queueMsg<BR>$YELLOW_PIC $Qset@$HOST is in ALARM"
                ;;
            *[sS]*)
                # Shown with a yellow marker but does not change the page colour.
                queueMsg="$queueMsg<BR>$YELLOW_PIC $Qset@$HOST is SUSPENDED"
                ;;
            *u*)
                sge_raise_color yellow
                queueMsg="$queueMsg<BR>$YELLOW_PIC $Qset@$HOST is UNAVAILABLE!"
                queueTriggered=true
                ;;
            "")
                queueMsg="$queueMsg<BR>$GREEN_PIC $Qset@$HOST is OK"
                ;;
            *)
                queueMsg="$queueMsg<BR>$UNKNOWN_PIC $Qset@$HOST is UNKNOWN"
                queueTriggered=true
                ;;
        esac
    done

    if [ -f "$qstate_file" ]; then
        $RM "$qstate_file"
    fi

    get_header "Queue Instance Status Report"
    echo "$queueMsg"
    get_footer

    #####
    ##### Make sure to export COLOR so that it gets back to "central"
    #####
    export COLOR
    #####
    ##### End of get_status proc
    #####
}
#####
##### Main body
#####
# Scratch file that holds the report text while the status message is built.
# Every use is quoted so a BBTMP containing whitespace or glob characters
# cannot split the path or make the redirect ambiguous.
sge_report_file="$BBTMP/$MACHINE.$TEST"

# get_status is called in the current shell (only its stdout is redirected),
# so the COLOR it sets is available below.
get_status > "$sge_report_file"

# NOW USE THE BB COMMAND TO SEND THE DATA ACROSS
sge_report=`$CAT "$sge_report_file"`
$BB $BBDISP "status $MACHINE.$TEST $COLOR `$DATE` $sge_report"

#For testing only
# echo $BB $BBDISP "status $BBTMP/$MACHINE.$TEST $COLOR `$DATE` $sge_report" >/tmp/qtmp

# Clean up our mess
# Checking for existence of each file since the whole test may be optional
# and may not actually run on every client
#
if [ -f "$sge_report_file" ]; then
    $RM "$sge_report_file"
fi
##############################################
# end of script
##############################################
Known Bugs and Issues