monitors:sge

SGE (Sun Grid Engine) Monitor

Author Butch Deal
Compatibility Xymon 4.2
Requirements SGE
Download None
Last Update 2009-06-02

This script alerts if any queue instance on the local host is in an error state, and reports some basic information: the jobs running on the local host, the host definition, and the status of each local queue, including any errors found. Version 0.2 marks disabled queues as clear instead of error. Version 0.3 adds an "unknown" queue status and cuts down on the number of qhost runs.

Jobs 

job-ID  prior   name       user         state submit/start at     queue                          slots ja-task-ID 
-----------------------------------------------------------------------------------------------------------------
4839010 0.60000 ovl_p.huma bwalenz      r     06/02/2009 13:18:39 medium.q@dell-1-0-6.tigr.org       2 10
4839988 0.59902 ovl_d.anan bwalenz      r     06/02/2009 15:32:37 medium.q@dell-1-0-6.tigr.org       2 17
4839988 0.59902 ovl_d.anan bwalenz      r     06/02/2009 15:37:25 medium.q@dell-1-0-6.tigr.org       2 21
4835225 0.50000 blastp     djohnson     r     06/02/2009 09:56:29 default.q@dell-1-0-6.tigr.org      1        
4835351 0.50000 blastp     djohnson     r     06/02/2009 10:11:21 default.q@dell-1-0-6.tigr.org      1        
4835471 0.50000 blastp     djohnson     r     06/02/2009 11:11:05 default.q@dell-1-0-6.tigr.org      1        
4835507 0.50000 blastp     djohnson     r     06/02/2009 11:16:08 default.q@dell-1-0-6.tigr.org      1        


Host 

HOSTNAME                ARCH         NCPU  LOAD  MEMTOT  MEMUSE  SWAPTO  SWAPUS
-------------------------------------------------------------------------------
dell-1-0-6              lx26-eon64      8 11.60   15.7G    6.5G   16.0G   51.2M


Queue Instance Status Report 


yellow default.q@dell-1-0-6 is in ALARM
green fast.q@dell-1-0-6 is OK
green medium.q@dell-1-0-6 is OK
green troubleshooting.q@dell-1-0-6 is OK

Client side

Edit sge.sh and set SGEBIN path. Add this to clientlaunch.cfg :

[sge]
	ENVFILE $HOBBITCLIENTHOME/etc/hobbitclient.cfg
	CMD $HOBBITCLIENTHOME/ext/sge.sh
	LOGFILE $HOBBITCLIENTHOME/logs/sge.log
	INTERVAL 5m

Server side

Show Code ⇲

Hide Code ⇱

#!/bin/sh 
#
# SGE: Sun Grid Engine check - BB external script test
#
#####  Purpose is to report back to a central server, all Sun
#####     Grid Engine software faults.
#####
#
# version 0.3
#
# BIG BROTHER / XXXXXXXXXXXXXXXX status
#
# Written by Butch Deal <butchdeal@yahoo.com>
#	     Daniel Gomez <dgomez@tigr.org,daniel@ixplosive.com>
#
# v0.3e 10/14/08 cut down on the number of qhost runs
# v0.3d 03/31/06 added alarm/suspend state identification 
# v0.3c 03/01/06 propagated yellow state upon UNAVAILABLE queue instances
# v0.3b 01/31/06 fixed yellow warning queue status for ambiguous config test 
# v0.3a 01/31/06 added unknown queue status and ambiguous config test 
# v0.3 01/26/06 fixed status reporting and optimized job status 
# v0.2 08/03/05 flag disabled queues as clear
# v0.1 07/28/05 authored

########################################
# NOTE
# This has been tested with BB 1.9e and Xymon 4.2.x
#
# The color assigned to each queue state is arbitrary and should be
# reviewed for your particular environment.
#
# Tested on :
#   Solaris & Linux
########################################

########################################
# INSTALLATION
#  step 1 - update bb-bbexttab to include this script
#
#  step 2 - restart Big Brother
#
# NOTE - the TEST variable in the configuration section is the name used
#        as the column header.
########################################

##################################
# CONFIGURE IT HERE
##################################
# TEST is the name used as the column header for this check.
TEST="sge"
BBPROG="$0"; export BBPROG

# Location of the SGE client tools.  A value already present in the
# environment is kept; the paths below are only defaults.  (Assigning them
# unconditionally would silently discard any externally supplied setting.)
SGEBIN="${SGEBIN:-/usr/local/bin}"
QSTAT="${QSTAT:-${SGEBIN}/qstat}"
QHOST="${QHOST:-${SGEBIN}/qhost}"
QSELECT="${QSELECT:-${SGEBIN}/qselect}"
export SGEBIN QSTAT QHOST QSELECT

# define colours for graphics
# Comment these out if using older BB versions
CLEAR_PIC="&clear"
RED_PIC="&red"
YELLOW_PIC="&yellow"
GREEN_PIC="&green"
UNKNOWN_PIC="&purple"

##################################
# Start of script
##################################
# Keep a BBHOME that the launcher already put in the environment; fall back
# to /var/BB/bb only when none is set.  Because of this default BBHOME can
# never be empty here, so the only sanity check needed is that the
# directory actually exists.
BBHOME="${BBHOME:-/var/BB/bb}"; export BBHOME

if test ! -d "$BBHOME"
then
        echo "template: BBHOME is invalid"
        exit 1
fi

if test ! "$BBTMP"                      # GET DEFINITIONS IF NEEDED
then
         # echo "*** LOADING BBDEF ***"
        . "$BBHOME/etc/bbdef.sh"        # INCLUDE STANDARD DEFINITIONS
fi

get_header()
{
  # Print a blank line, then the section title ($1) as a large bold HTML
  # heading.  $2 is accepted but only used by the optional layouts below.
  printf '\n%s\n' "<FONT SIZE=+2><b>$1</b></FONT> <BR>"

  # Optional layouts - enable one of these in place of the heading above.
  #
  # Large heading followed by ($2):
  #echo "<FONT SIZE=+2><b>$1</b></FONT> ($2)<BR>"
  #
  # Heading in the normal font size:
  #echo "<b>$1</b> ($2)"
  #
  # The "Paul Luzzi" look:
  #echo "<P><DIV ALIGN=\"CENTER\"><HR>" 
  #echo "<B>============== $1 ==============</B>"
  #echo "<B>--- ($2) ---</B>"
  #echo "<HR></DIV>" 
  #echo "<BLOCKQUOTE>" 
}
get_header_small()
{
  # Same as get_header, but the title ($1) is rendered one font step
  # smaller (SIZE=+1).  $2 is only used by the optional layouts below.
  printf '\n%s\n' "<FONT SIZE=+1><b>$1</b></FONT> <BR>"

  # Optional layouts - enable one of these in place of the heading above.
  #
  # Large heading followed by ($2):
  #echo "<FONT SIZE=+2><b>$1</b></FONT> ($2)<BR>"
  #
  # Heading in the normal font size:
  # echo "<b>$1</b> ($2)"
  #
  # The "Paul Luzzi" look:
  #echo "<P><DIV ALIGN=\"CENTER\"><HR>" 
  #echo "<B>============== $1 ==============</B>"
  #echo "<B>--- ($2) ---</B>"
  #echo "<HR></DIV>" 
  #echo "<BLOCKQUOTE>" 
}


get_footer()
{
  # Close a report section with a single blank line.
  printf '\n'

  # For the "Paul Luzzi" look, enable this line as well so the BLOCKQUOTE
  # opened by the matching header is closed:
  #echo "</BLOCKQUOTE>"
}

#####
#####  Get Status proc - used to get all responses
#####

# Helper for get_status: raise the global COLOR to $1, but only when $1 is
# more severe than the current value (green < clear < yellow < red).  This
# keeps a milder state seen on a later queue from hiding a worse state seen
# on an earlier one.
_sge_raise_color()
{
  case "$1" in
    red)
      COLOR="red"
      ;;
    yellow)
      if [ "$COLOR" != "red" ]; then
        COLOR="yellow"
      fi
      ;;
    clear)
      if [ "$COLOR" = "green" ]; then
        COLOR="clear"
      fi
      ;;
  esac
}

# get_status: write the report body to stdout and set the overall COLOR.
#
# Sections written: running jobs on this host, the host line from qhost,
# and one status line per queue instance.
#
# Globals read:    SGEBIN QSTAT QHOST GREP AWK MACHINEDOTS HOST and the
#                  *_PIC icon strings
# Globals written: COLOR (exported), queueMsg, queueTriggered, jobs,
#                  Qset, qstate, queueStates (and SGEBIN/QSTAT/QHOST when
#                  they were empty)
#
# Must be called in the current shell (not in a command substitution),
# otherwise the COLOR it sets is lost to the caller.  The queue loop uses
# "set --", which replaces the positional parameters while it runs.
get_status()
{
  #####
  #####  Setup some variables for use later
  #####
  COLOR="green"
  queueMsg=""
  queueTriggered=false

  # Check defaults have been set
  if [ "$SGEBIN" = "" ]; then
    SGEBIN=/usr/local/bin
    echo ""
    echo "$YELLOW_PIC SGEBIN command is not defined in etc/bbsys.local - using default: $SGEBIN"
  fi

  if [ "$QSTAT" = "" ]; then
    QSTAT=${SGEBIN}/qstat
    echo ""
    echo "$YELLOW_PIC QSTAT command is not defined in etc/bbsys.local - using default: $QSTAT"
  fi

  if [ "$QHOST" = "" ]; then
    QHOST=${SGEBIN}/qhost
    echo ""
    echo "$YELLOW_PIC QHOST command is not defined in etc/bbsys.local - using default: $QHOST"
  fi

  ###
  ### Check the jobs
  ###
  get_header "Jobs" "$QSTAT -l hostname=$MACHINEDOTS"
  # Run qstat once and print the captured listing instead of running it a
  # second time.
  jobs=`${QSTAT} -l hostname="${MACHINEDOTS}" -s r`
  if [ -z "$jobs" ]; then
     echo "No Running Jobs"
  else
     printf '%s\n' "$jobs"
  fi
  get_footer

  ###
  ### Check the host
  ###
  get_header "Host" "$QHOST -h $MACHINEDOTS"
  ${QHOST} -h "${MACHINEDOTS}" | ${GREP} -v "global"
  get_footer

  ###
  ### Check queue instance states
  ###
  # One qhost run, one awk pass: skip the first four lines (the lines the
  # original dropped with "tail +5") and emit "<queue> <state>" per queue
  # line, taking the state from column 4 and using "-" when it is empty.
  # Reading whole fields gives an exact queue-name match, so a queue whose
  # name is a prefix of another (fast.q / fast.q2) gets its own state only.
  queueStates=`${QHOST} -h "${MACHINEDOTS}" -q | ${AWK} 'NR >= 5 && NF > 0 { state = $4; if (state == "") state = "-"; print $1, state }'`

  # Walk the name/state pairs two words at a time.
  set -- $queueStates
  while [ $# -ge 2 ]; do
    Qset=$1
    qstate=$2
    shift 2

    # Order determines which state wins for a single queue instance.
    # ALARM and SUSPENDED are listed but do not change COLOR.
    case "$qstate" in
      *d*)
        _sge_raise_color clear
        queueMsg="$queueMsg<BR>$CLEAR_PIC $Qset@$HOST is DISABLED"
        queueTriggered=true
        ;;
      *E*)
        _sge_raise_color red
        queueMsg="$queueMsg<BR>$RED_PIC $Qset@$HOST is in ERROR!"
        queueTriggered=true
        ;;
      *c*)
        _sge_raise_color yellow
        queueMsg="$queueMsg<BR>$YELLOW_PIC $Qset@$HOST has an ambigious configuration!"
        queueTriggered=true
        ;;
      *[aA]*)
        queueMsg="$queueMsg<BR>$YELLOW_PIC $Qset@$HOST is in ALARM"
        ;;
      *[sS]*)
        queueMsg="$queueMsg<BR>$YELLOW_PIC $Qset@$HOST is SUSPENDED"
        ;;
      *u*)
        _sge_raise_color yellow
        queueMsg="$queueMsg<BR>$YELLOW_PIC $Qset@$HOST is UNAVAILABLE!"
        queueTriggered=true
        ;;
      -)
        # empty state column: nothing is flagged on this queue instance
        queueMsg="$queueMsg<BR>$GREEN_PIC $Qset@$HOST is OK"
        ;;
      *)
        queueMsg="$queueMsg<BR>$UNKNOWN_PIC $Qset@$HOST is UNKNOWN"
        queueTriggered=true
        ;;
    esac
  done

  get_header "Queue Instance Status Report"
  echo "$queueMsg"
  get_footer

  #####
  #####  Make sure to export COLOR so that it gets back to "central"
  #####
  export COLOR

#####
#####  End of get_status proc
#####
}

#####
#####  Main body
#####
  # get_status has to run in this shell rather than in a command
  # substitution so that the COLOR it sets is still visible below; its
  # output therefore goes through a temporary file.
  get_status > "$BBTMP/$MACHINE.$TEST"

  # Collect the pieces first so the status message is built from plain
  # variables instead of nested quoting inside backticks.
  reportDate=`$DATE`
  reportBody=`$CAT "$BBTMP/$MACHINE.$TEST"`

  # NOW USE THE BB COMMAND TO SEND THE DATA ACROSS
  $BB $BBDISP "status $MACHINE.$TEST $COLOR $reportDate $reportBody"
	#For testing only
#      echo $BB $BBDISP "status $BBTMP/$MACHINE.$TEST $COLOR `$DATE` `$CAT $BBTMP/$MACHINE.$TEST` ">/tmp/qtmp


# Clean up our mess
# Checking for existence of each file since the whole test may be optional
#   and may not actually run on every client
#
if [ -f "$BBTMP/$MACHINE.$TEST" ]; then
  $RM "$BBTMP/$MACHINE.$TEST"
fi
##############################################
# end of script
##############################################
  • Daniel Gomez
  • Butch Deal
  • YYYY-MM-DD
    • Initial release
  • monitors/sge.txt
  • Last modified: 2009/11/23 06:10
  • by 127.0.0.1