no way to compare when less than two revisions
Differences
This shows you the differences between two versions of the page.
— | monitors:sge [2009/11/23 06:10] (current) – created - external edit 127.0.0.1 | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== My Monitor (CHANGEME) ====== | ||
+ | |||
+ | ^ Author | [[ butchdeal@yahoo.com | Butch Deal ]] | | ||
+ | ^ Compatibility | Xymon 4.2 | | ||
+ | ^ Requirements | SGE | | ||
+ | ^ Download | None | | ||
+ | ^ Last Update | 2009-06-02 | | ||
+ | |||
+ | ===== Description ===== | ||
+ | This script will alert if any local queues are in error, and will return some basic information such as listing the queues and jobs on the local host. It will also give the host definition and list any errors found. Version 0.2, now marks disabled queues as clear instead of error. Version 0.3, added unknown queue status, and cut down on the number of qhost runs. | ||
+ | < | ||
+ | Jobs | ||
+ | |||
+ | job-ID | ||
+ | ----------------------------------------------------------------------------------------------------------------- | ||
+ | 4839010 0.60000 ovl_p.huma bwalenz | ||
+ | 4839988 0.59902 ovl_d.anan bwalenz | ||
+ | 4839988 0.59902 ovl_d.anan bwalenz | ||
+ | 4835225 0.50000 blastp | ||
+ | 4835351 0.50000 blastp | ||
+ | 4835471 0.50000 blastp | ||
+ | 4835507 0.50000 blastp | ||
+ | |||
+ | |||
+ | Host | ||
+ | |||
+ | HOSTNAME | ||
+ | ------------------------------------------------------------------------------- | ||
+ | dell-1-0-6 | ||
+ | |||
+ | |||
+ | Queue Instance Status Report | ||
+ | |||
+ | |||
+ | yellow default.q@dell-1-0-6 is in ALARM | ||
+ | green fast.q@dell-1-0-6 is OK | ||
+ | green medium.q@dell-1-0-6 is OK | ||
+ | green troubleshooting.q@dell-1-0-6 is OK | ||
+ | </ | ||
+ | ===== Installation ===== | ||
+ | === Client side === | ||
+ | Edit sge.sh and set SGEBIN path. | ||
+ | Add this to clientlaunch.cfg : | ||
+ | < | ||
+ | [sge] | ||
+ | ENVFILE $HOBBITCLIENTHOME/ | ||
+ | CMD $HOBBITCLIENTHOME/ | ||
+ | LOGFILE $HOBBITCLIENTHOME/ | ||
+ | INTERVAL 5m | ||
+ | </ | ||
+ | === Server side === | ||
+ | |||
+ | ===== Source ===== | ||
+ | ==== sge.sh ==== | ||
+ | <hidden onHidden=" | ||
+ | < | ||
+ | # | ||
+ | # | ||
+ | # SGE: Sun Grid Engine check - BB external script test | ||
+ | # | ||
+ | ##### Purpose is to report back to a central server, all Sun | ||
+ | ##### Grid Engine software faults. | ||
+ | ##### | ||
+ | # | ||
+ | # version 0.3 | ||
+ | # | ||
+ | # BIG BROTHER / XXXXXXXXXXXXXXXX status | ||
+ | # | ||
+ | # Written by Butch Deal < | ||
+ | # | ||
+ | # | ||
+ | # v0.3e 10/14/08 cut down on the number of qhost runs | ||
+ | # v0.3d 03/31/06 added alarm/ | ||
+ | # v0.3c 03/01/06 propagated yellow state upon UNAVAILABLE queue instances | ||
+ | # v0.3b 01/31/06 fixed yellow warning queue status for ambiguous config test | ||
+ | # v0.3a 01/31/06 added unknown queue status and ambiguous config test | ||
+ | # v0.3 01/26/06 fixed status reporting and optimized job status | ||
+ | # v0.2 08/03/05 flag disabled queues as clear | ||
+ | # v0.1 07/28/05 authored | ||
+ | |||
+ | ######################################## | ||
+ | # NOTE | ||
+ | # This has been tested with BB 1.9e and Xymon 4.2.x | ||
+ | # | ||
+ | # The color status with respect to queue status is arbitrary and should be | ||
+ | # reviewed for your particular environment. | ||
+ | # | ||
+ | # Tested on : | ||
+ | # | ||
+ | ######################################## | ||
+ | |||
+ | ######################################## | ||
+ | # INSTALLATION | ||
+ | # step 1 - update bb-bbexttab to include this script | ||
+ | # | ||
+ | # step 4 - restart Big Brother | ||
+ | # | ||
+ | # NOTE - the TEST variable in the configuration section sets the name used | ||
+ | # as the column header. | ||
+ | ######################################## | ||
+ | |||
+ | ################################## | ||
+ | # CONFIGURE IT HERE | ||
+ | ################################## | ||
+ | TEST=" | ||
+ | BBPROG=" | ||
+ | |||
+ | SGEBIN=/ | ||
+ | QSTAT=${SGEBIN}/ | ||
+ | QHOST=${SGEBIN}/ | ||
+ | QSELECT=${SGEBIN}/ | ||
+ | export SGEBIN QSTAT QHOST QSELECT | ||
+ | |||
+ | # define colours for graphics | ||
+ | # Comment these out if using older BB versions | ||
+ | CLEAR_PIC="& | ||
+ | RED_PIC="& | ||
+ | YELLOW_PIC="& | ||
+ | GREEN_PIC="& | ||
+ | UNKNOWN_PIC="& | ||
+ | |||
+ | ################################## | ||
+ | # Start of script | ||
+ | ################################## | ||
+ | BBHOME="/ | ||
+ | |||
+ | if test ! " | ||
+ | then | ||
+ | echo " | ||
+ | exit 1 | ||
+ | fi | ||
+ | |||
+ | if test ! -d " | ||
+ | then | ||
+ | echo " | ||
+ | exit 1 | ||
+ | fi | ||
+ | |||
+ | if test ! " | ||
+ | then | ||
+ | # echo "*** LOADING BBDEF ***" | ||
+ | . $BBHOME/ | ||
+ | fi | ||
+ | |||
+ | get_header() | ||
+ | { | ||
+ | echo "" | ||
+ | #echo "< | ||
+ | echo "< | ||
+ | # If you do not want the header in a bigger font use line below instead | ||
+ | #echo "< | ||
+ | # If you want the "Paul Luzzi" look uncomment this section and comment | ||
+ | # out the above sections: | ||
+ | #echo "< | ||
+ | #echo "< | ||
+ | #echo "< | ||
+ | #echo "< | ||
+ | #echo "< | ||
+ | } | ||
+ | get_header_small() | ||
+ | { | ||
+ | echo "" | ||
+ | #echo "< | ||
+ | echo "< | ||
+ | # If you do not want the header in a bigger font use line below instead | ||
+ | # echo "< | ||
+ | # If you want the "Paul Luzzi" look uncomment this section and comment | ||
+ | # out the above sections: | ||
+ | #echo "< | ||
+ | #echo "< | ||
+ | #echo "< | ||
+ | #echo "< | ||
+ | #echo "< | ||
+ | } | ||
+ | |||
+ | |||
+ | get_footer() | ||
+ | { | ||
+ | echo "" | ||
+ | # If you want the "Paul Luzzi" look uncomment this section and comment | ||
+ | # out the above sections: | ||
+ | #echo "</ | ||
+ | } | ||
+ | |||
+ | ##### | ||
+ | ##### Get Status proc - used to get all responses | ||
+ | ##### | ||
+ | get_status() | ||
+ | { | ||
+ | ##### | ||
+ | ##### Setup some variables for use later | ||
+ | ##### | ||
+ | COLOR=" | ||
+ | |||
+ | # Check defaults have been set | ||
+ | if [ " | ||
+ | SGEBIN=/ | ||
+ | echo "" | ||
+ | echo " | ||
+ | fi | ||
+ | |||
+ | if [ " | ||
+ | QSTAT=${SGEBIN}/ | ||
+ | echo "" | ||
+ | echo " | ||
+ | fi | ||
+ | |||
+ | if [ " | ||
+ | QHOST=${SGEBIN}/ | ||
+ | echo "" | ||
+ | echo " | ||
+ | fi | ||
+ | |||
+ | ### | ||
+ | ### Check the jobs | ||
+ | ### | ||
+ | get_header " | ||
+ | jobs=`${QSTAT} -l hostname=${MACHINEDOTS} -s r` | ||
+ | if [ -z " | ||
+ | echo "No Running Jobs" | ||
+ | else | ||
+ | | ||
+ | fi | ||
+ | get_footer | ||
+ | |||
+ | ### | ||
+ | ### Check the host | ||
+ | ### | ||
+ | get_header " | ||
+ | ${QHOST} -h ${MACHINEDOTS} | ${GREP} -v " | ||
+ | get_footer | ||
+ | |||
+ | ### | ||
+ | ### Identify queue memberships | ||
+ | ### | ||
+ | #get_header "Queue Membership" | ||
+ | #${QHOST} -h ${MACHINEDOTS} -q | ${TAIL} +5 | ||
+ | #get_footer | ||
+ | |||
+ | ### | ||
+ | ### Check queue instance states | ||
+ | ### | ||
+ | queueTriggered=false; | ||
+ | # Queueset=`${QHOST} -h ${MACHINEDOTS} -q | ${TAIL} +5 | ${AWK} '{ print $1 }'` | ||
+ | ${QHOST} -h ${MACHINEDOTS} -q | ${TAIL} +5 > $BBTMP/ | ||
+ | Queueset=`cat $BBTMP/ | ||
+ | for Qset in $Queueset; do | ||
+ | # qstate=`${QHOST} -q -h ${MACHINEDOTS} | ${TAIL} +5 | $GREP " $Qset" | $AWK ' | ||
+ | qstate=`cat $BBTMP/ | ||
+ | |||
+ | # Order determines more significant color status | ||
+ | if [ "`echo $qstate | $GREP -c d`" != " | ||
+ | COLOR=" | ||
+ | queueMsg=`echo " | ||
+ | queueTriggered=true; | ||
+ | elif [ "`echo $qstate | $GREP -c E`" != " | ||
+ | COLOR=" | ||
+ | queueMsg=`echo " | ||
+ | queueTriggered=true; | ||
+ | elif [ "`echo $qstate | $GREP -c c`" != " | ||
+ | COLOR=" | ||
+ | queueMsg=`echo " | ||
+ | queueTriggered=true; | ||
+ | elif [ "`echo $qstate | $GREP -c a`" != " | ||
+ | [ "`echo $qstate | $GREP -c A`" != " | ||
+ | queueMsg=`echo " | ||
+ | elif [ "`echo $qstate | $GREP -c s`" != " | ||
+ | [ "`echo $qstate | $GREP -c S`" != " | ||
+ | queueMsg=`echo " | ||
+ | elif [ "`echo $qstate | $GREP -c u`" != " | ||
+ | COLOR=" | ||
+ | queueMsg=`echo " | ||
+ | queueTriggered=true; | ||
+ | elif [ " | ||
+ | queueMsg=`echo " | ||
+ | else | ||
+ | queueMsg=`echo " | ||
+ | queueTriggered=true; | ||
+ | fi | ||
+ | done | ||
+ | |||
+ | if [ -f $BBTMP/ | ||
+ | $RM $BBTMP/ | ||
+ | fi | ||
+ | |||
+ | get_header "Queue Instance Status Report" | ||
+ | echo " | ||
+ | get_footer | ||
+ | |||
+ | ##### | ||
+ | ##### Make sure to export COLOR so that it gets back to " | ||
+ | ##### | ||
+ | export COLOR | ||
+ | |||
+ | ##### | ||
+ | ##### End of get_status proc | ||
+ | ##### | ||
+ | } | ||
+ | |||
+ | ##### | ||
+ | ##### Main body | ||
+ | ##### | ||
+ | get_status > $BBTMP/ | ||
+ | |||
+ | # NOW USE THE BB COMMAND TO SEND THE DATA ACROSS | ||
+ | $BB $BBDISP " | ||
+ | #For testing only | ||
+ | # echo $BB $BBDISP " | ||
+ | |||
+ | |||
+ | # Clean up our mess | ||
+ | # Checking for existence of each file since the whole test may be optional | ||
+ | # and may not actually run on every client | ||
+ | # | ||
+ | if [ -f $BBTMP/ | ||
+ | $RM $BBTMP/ | ||
+ | fi | ||
+ | ############################################## | ||
+ | # end of script | ||
+ | ############################################## | ||
+ | </ | ||
+ | </ | ||
+ | |||
+ | ===== Known Bugs and Issues ===== | ||
+ | |||
+ | ===== To Do ===== | ||
+ | |||
+ | ===== Credits ===== | ||
+ | * Daniel Gomez | ||
+ | * Butch Deal | ||
+ | ===== Changelog ===== | ||
+ | |||
+ | * **YYYY-MM-DD** | ||
+ | * Initial release | ||