| |
| — | monitors:lsf_queue [2009/11/23 05:52] (current) – created - external edit 127.0.0.1 |
|---|
| | ====== lsf_queues ====== |
| | |
| | ^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] | |
| | ^ Compatibility | Xymon 4.2 | |
| | ^ Requirements | Perl, unix, Platform LSF | |
| | ^ Download | None | |
| | ^ Last Update | 2007-04-08 | |
| | |
| | ===== Description ===== |
| | |
| | This script is used to report the number of jobs in running/pending/suspended state |
| | for each queue in an ncv-compatible fashion, allowing queue graphing. |
| | |
| | ===== Installation ===== |
| | === Client side === |
| | * Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on linux/debian), with owner hobbit.hobbit and permissions 0755 |
| | * Add an entry to the clientlaunch.cfg file on each host that should run the script. Example for a linux/debian host:<code> |
| | [lsfqueues] |
| | ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg |
| | CMD /usr/lib/hobbit/client/ext/lsf_queues.pl |
| | INTERVAL 5m |
| | </code> |
| | |
| | === Server side === |
| | * In hobbitserver.cfg: |
| | - Append "lsf_queues=ncv" to TEST2RRD |
| | - Append "lsf_queues" to GRAPHS |
| | - Add the following line for NCV:<code> |
| | NCV_lsf_queues="*:GAUGE" |
| | </code> |
| | * You need an entry in one of the bb-hosts files:<code> |
| | 1.2.3.4 myfarm # noconn TRENDS:*,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues |
| | </code>or if you want to combine it with the [[monitors:lsf_mon|lsf_mon script]]:<code> |
| | 1.2.3.4 myfarm # noconn TRENDS:*,lsf,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues |
| | </code> |
| | * Edit the hobbitgraph.cfg configuration to add the following entries. Please note that I manually entered an entry for each queue I wanted to display. You will obviously need to change it to adapt it to your setup. |
| | <hidden onHidden="Show Code ⇲" onVisible="Hide Code ⇱"> |
| | <code> |
| | [lsf_pending_queues] |
| | DEF:queue1=lsf_queues.rrd:queue1pending:AVERAGE |
| | DEF:queue2=lsf_queues.rrd:queue2pending:AVERAGE |
| | DEF:queue3=lsf_queues.rrd:queue3pending:AVERAGE |
| | DEF:queue4=lsf_queues.rrd:queue4pending:AVERAGE |
| | DEF:queue5=lsf_queues.rrd:queue5pending:AVERAGE |
| | DEF:queue6=lsf_queues.rrd:queue6pending:AVERAGE |
| | DEF:queue7=lsf_queues.rrd:queue7pending:AVERAGE |
| | DEF:queue8=lsf_queues.rrd:queue8pending:AVERAGE |
| | TITLE LSF queues: number of pending jobs per queue |
| | -l 0 |
| | YAXIS # |
| | AREA:queue1#99ccff:queue1 |
| | GPRINT:queue1:LAST: \: %5.1lf (cur) |
| | GPRINT:queue1:MAX: \: %5.1lf (max) |
| | GPRINT:queue1:MIN: \: %5.1lf (min) |
| | GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue2#cc99ff:queue2 |
| | GPRINT:queue2:LAST: \: %5.1lf (cur) |
| | GPRINT:queue2:MAX: \: %5.1lf (max) |
| | GPRINT:queue2:MIN: \: %5.1lf (min) |
| | GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue3#9933ff:queue3 |
| | GPRINT:queue3:LAST: \: %5.1lf (cur) |
| | GPRINT:queue3:MAX: \: %5.1lf (max) |
| | GPRINT:queue3:MIN: \: %5.1lf (min) |
| | GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue4#3399ff:queue4 |
| | GPRINT:queue4:LAST: \: %5.1lf (cur) |
| | GPRINT:queue4:MAX: \: %5.1lf (max) |
| | GPRINT:queue4:MIN: \: %5.1lf (min) |
| | GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue5#ff3333:queue5 |
| | GPRINT:queue5:LAST: \: %5.1lf (cur) |
| | GPRINT:queue5:MAX: \: %5.1lf (max) |
| | GPRINT:queue5:MIN: \: %5.1lf (min) |
| | GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue6#ff9933:queue6 |
| | GPRINT:queue6:LAST: \: %5.1lf (cur) |
| | GPRINT:queue6:MAX: \: %5.1lf (max) |
| | GPRINT:queue6:MIN: \: %5.1lf (min) |
| | GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue7#00ff00:queue7 |
| | GPRINT:queue7:LAST: \: %5.1lf (cur) |
| | GPRINT:queue7:MAX: \: %5.1lf (max) |
| | GPRINT:queue7:MIN: \: %5.1lf (min) |
| | GPRINT:queue7:AVERAGE: \: %5.1lf (avg) |
| | |
| | [lsf_running_queues] |
| | DEF:queue1=lsf_queues.rrd:queue1running:AVERAGE |
| | DEF:queue2=lsf_queues.rrd:queue2running:AVERAGE |
| | DEF:queue3=lsf_queues.rrd:queue3running:AVERAGE |
| | DEF:queue4=lsf_queues.rrd:queue4running:AVERAGE |
| | DEF:queue5=lsf_queues.rrd:queue5running:AVERAGE |
| | DEF:queue6=lsf_queues.rrd:queue6running:AVERAGE |
| | DEF:queue7=lsf_queues.rrd:queue7running:AVERAGE |
| | DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE |
| | TITLE LSF queues: number of running jobs per queue |
| | -l 0 |
| | YAXIS # |
| | AREA:queue1#99ccff:queue1 |
| | GPRINT:queue1:LAST: \: %5.1lf (cur) |
| | GPRINT:queue1:MAX: \: %5.1lf (max) |
| | GPRINT:queue1:MIN: \: %5.1lf (min) |
| | GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue2#cc99ff:queue2 |
| | GPRINT:queue2:LAST: \: %5.1lf (cur) |
| | GPRINT:queue2:MAX: \: %5.1lf (max) |
| | GPRINT:queue2:MIN: \: %5.1lf (min) |
| | GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue3#9933ff:queue3 |
| | GPRINT:queue3:LAST: \: %5.1lf (cur) |
| | GPRINT:queue3:MAX: \: %5.1lf (max) |
| | GPRINT:queue3:MIN: \: %5.1lf (min) |
| | GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue4#3399ff:queue4 |
| | GPRINT:queue4:LAST: \: %5.1lf (cur) |
| | GPRINT:queue4:MAX: \: %5.1lf (max) |
| | GPRINT:queue4:MIN: \: %5.1lf (min) |
| | GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue5#ff3333:queue5 |
| | GPRINT:queue5:LAST: \: %5.1lf (cur) |
| | GPRINT:queue5:MAX: \: %5.1lf (max) |
| | GPRINT:queue5:MIN: \: %5.1lf (min) |
| | GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue6#ff9933:queue6 |
| | GPRINT:queue6:LAST: \: %5.1lf (cur) |
| | GPRINT:queue6:MAX: \: %5.1lf (max) |
| | GPRINT:queue6:MIN: \: %5.1lf (min) |
| | GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue7#00ff00:queue7 |
| | GPRINT:queue7:LAST: \: %5.1lf (cur) |
| | GPRINT:queue7:MAX: \: %5.1lf (max) |
| | GPRINT:queue7:MIN: \: %5.1lf (min) |
| | GPRINT:queue7:AVERAGE: \: %5.1lf (avg) |
| | |
| | [lsf_suspended_queues] |
| | DEF:queue1=lsf_queues.rrd:queue1suspended:AVERAGE |
| | DEF:queue2=lsf_queues.rrd:queue2suspended:AVERAGE |
| | DEF:queue3=lsf_queues.rrd:queue3suspended:AVERAGE |
| | DEF:queue4=lsf_queues.rrd:queue4suspended:AVERAGE |
| | DEF:queue5=lsf_queues.rrd:queue5suspended:AVERAGE |
| | DEF:queue6=lsf_queues.rrd:queue6suspended:AVERAGE |
| | DEF:queue7=lsf_queues.rrd:queue7suspended:AVERAGE |
| | DEF:queue8=lsf_queues.rrd:queue8suspended:AVERAGE |
| | TITLE LSF queues: number of suspended jobs per queue |
| | -l 0 |
| | YAXIS # |
| | AREA:queue1#99ccff:queue1 |
| | GPRINT:queue1:LAST: \: %5.1lf (cur) |
| | GPRINT:queue1:MAX: \: %5.1lf (max) |
| | GPRINT:queue1:MIN: \: %5.1lf (min) |
| | GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue2#cc99ff:queue2 |
| | GPRINT:queue2:LAST: \: %5.1lf (cur) |
| | GPRINT:queue2:MAX: \: %5.1lf (max) |
| | GPRINT:queue2:MIN: \: %5.1lf (min) |
| | GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue3#9933ff:queue3 |
| | GPRINT:queue3:LAST: \: %5.1lf (cur) |
| | GPRINT:queue3:MAX: \: %5.1lf (max) |
| | GPRINT:queue3:MIN: \: %5.1lf (min) |
| | GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue4#3399ff:queue4 |
| | GPRINT:queue4:LAST: \: %5.1lf (cur) |
| | GPRINT:queue4:MAX: \: %5.1lf (max) |
| | GPRINT:queue4:MIN: \: %5.1lf (min) |
| | GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue5#ff3333:queue5 |
| | GPRINT:queue5:LAST: \: %5.1lf (cur) |
| | GPRINT:queue5:MAX: \: %5.1lf (max) |
| | GPRINT:queue5:MIN: \: %5.1lf (min) |
| | GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue6#ff9933:queue6 |
| | GPRINT:queue6:LAST: \: %5.1lf (cur) |
| | GPRINT:queue6:MAX: \: %5.1lf (max) |
| | GPRINT:queue6:MIN: \: %5.1lf (min) |
| | GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:queue7#00ff00:queue7 |
| | GPRINT:queue7:LAST: \: %5.1lf (cur) |
| | GPRINT:queue7:MAX: \: %5.1lf (max) |
| | GPRINT:queue7:MIN: \: %5.1lf (min) |
| | GPRINT:queue7:AVERAGE: \: %5.1lf (avg) |
| | |
| | [lsf_queues] |
| | DEF:queue1p=lsf_queues.rrd:queue1pending:AVERAGE |
| | DEF:queue2p=lsf_queues.rrd:queue2pending:AVERAGE |
| | DEF:queue3p=lsf_queues.rrd:queue3pending:AVERAGE |
| | DEF:queue4p=lsf_queues.rrd:queue4pending:AVERAGE |
| | DEF:queue5p=lsf_queues.rrd:queue5pending:AVERAGE |
| | DEF:queue6p=lsf_queues.rrd:queue6pending:AVERAGE |
| | DEF:queue7p=lsf_queues.rrd:queue7pending:AVERAGE |
| | DEF:queue8p=lsf_queues.rrd:queue8pending:AVERAGE |
| | DEF:queue1r=lsf_queues.rrd:queue1running:AVERAGE |
| | DEF:queue2r=lsf_queues.rrd:queue2running:AVERAGE |
| | DEF:queue3r=lsf_queues.rrd:queue3running:AVERAGE |
| | DEF:queue4r=lsf_queues.rrd:queue4running:AVERAGE |
| | DEF:queue5r=lsf_queues.rrd:queue5running:AVERAGE |
| | DEF:queue6r=lsf_queues.rrd:queue6running:AVERAGE |
| | DEF:queue7r=lsf_queues.rrd:queue7running:AVERAGE |
| | DEF:queue8r=lsf_queues.rrd:queue8running:AVERAGE |
| | DEF:queue1s=lsf_queues.rrd:queue1suspended:AVERAGE |
| | DEF:queue2s=lsf_queues.rrd:queue2suspended:AVERAGE |
| | DEF:queue3s=lsf_queues.rrd:queue3suspended:AVERAGE |
| | DEF:queue4s=lsf_queues.rrd:queue4suspended:AVERAGE |
| | DEF:queue5s=lsf_queues.rrd:queue5suspended:AVERAGE |
| | DEF:queue6s=lsf_queues.rrd:queue6suspended:AVERAGE |
| | DEF:queue7s=lsf_queues.rrd:queue7suspended:AVERAGE |
| | DEF:queue8s=lsf_queues.rrd:queue8suspended:AVERAGE |
| | DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE |
| | |
| | CDEF:pending=queue1p,queue2p,+,queue3p,+,queue4p,+,queue5p,+,queue6p,+,queue7p,+,queue8p,+ |
| | |
| | CDEF:running=queue1r,queue2r,+,queue3r,+,queue4r,+,queue5r,+,queue6r,+,queue7r,+,queue8r,+ |
| | |
| | CDEF:suspended=queue1s,queue2s,+,queue3s,+,queue4s,+,queue5s,+,queue6s,+,queue7s,+,queue8s,+ |
| | TITLE LSF queues |
| | -l 0 |
| | YAXIS # |
| | AREA:running#cc99ff:running |
| | GPRINT:running:LAST: \: %5.1lf (cur) |
| | GPRINT:running:MAX: \: %5.1lf (max) |
| | GPRINT:running:MIN: \: %5.1lf (min) |
| | GPRINT:running:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:suspended#9933ff:suspended |
| | GPRINT:suspended:LAST: \: %5.1lf (cur) |
| | GPRINT:suspended:MAX: \: %5.1lf (max) |
| | GPRINT:suspended:MIN: \: %5.1lf (min) |
| | GPRINT:suspended:AVERAGE: \: %5.1lf (avg)\n |
| | STACK:pending#99ccff:pending |
| | GPRINT:pending:LAST: \: %5.1lf (cur) |
| | GPRINT:pending:MAX: \: %5.1lf (max) |
| | GPRINT:pending:MIN: \: %5.1lf (min) |
| | </code> |
| | </hidden> |
| | |
| | ===== Source ===== |
| | ==== lsf_queues.pl ==== |
| | <hidden onHidden="Show Code ⇲" onVisible="Hide Code ⇱"> |
| | <code perl> |
| | #!/usr/bin/perl -w |
| | # |
| | # client-side script to monitor queues on a lsf HPC farm |
| | # |
| | # copyright 2006 - 2007 Genome Research Limited / Gildas Le Nadan |
| | # This script is released under the Gnu Public |
| | # License (GPL) version 2 and Later |
| | |
| | use strict; |
| |
| | # Script version. Quoted on purpose: a bare 1.0.0 is parsed by Perl as a |
| | # v-string (the characters chr(1), chr(0), chr(0)), not as the text "1.0.0". |
| | my $version = "1.0.0"; |
| |
| | ##### PARAMETERS YOU CAN TWEAK |
| |
| | # you can debug this script in your environment by running |
| | # BB=echo BBDISP=127.0.0.1 ./lsf_queues.pl on one of your nodes |
| | # NOTE(review): $DEBUG is declared here but nothing in this script reads it. |
| | my $DEBUG = 0; # 1 for debug, 0 otherwise |
| |
| | # hobbit config |
| | my $farm = "farm-login"; # the hostname you want to report under |
| | my $hobbitcolumn = "lsf_queues"; # the column name for the test |
| | my $color = "green"; # default color |
| | my $summary = "LSF queues report"; # title of the test output |
| |
| | # how the lsf commands must be run |
| | # The command is executed right here, when this line is evaluated: the LSF |
| | # profile is sourced, then bqueues is run, and every line of its output |
| | # becomes one element of @bqueues. |
| | my @bqueues=`. /software/noarch/lsf/conf/profile.lsf && /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bqueues`; |
| |
| | # lines to ignore in the output |
| | my $bqueues_banner="QUEUE_NAME PRIO STATUS MAX JL/U JL/P JL/H NJOBS PEND RUN SUSP"; |
| |
| | #### INTERNAL PARAMETERS (NO TWEAKING REQUIRED) |
| |
| | # Path of the bb client program and address of the display server; both are |
| | # filled in from the BB and BBDISP environment variables in the main section. |
| | my $bb = ""; |
| | my $bbdisp = ""; |
| |
| | # Per-queue counters: $hqueues{<queue name>}{pend|run|susp}. |
| | my %hqueues = (); |
| | |
| | ### FUNCTIONS |
| | |
| | # Render the collected per-queue counters as the body of the status message. |
| | # Reads the file-level %hqueues and returns a single string: "Queues\n" |
| | # followed, for each queue in sorted name order, by one "name state : count" |
| | # line for pending, running and suspended, then an empty line. |
| | sub print_summary { |
| |     my @chunks = ( "Queues\n" ); |
| |     for my $name ( sort keys %hqueues ) { |
| |         my $counts = $hqueues{$name}; |
| |         push @chunks, |
| |             sprintf( "%s pending : %4u\n", $name, $counts->{pend} ), |
| |             sprintf( "%s running : %4u\n", $name, $counts->{run} ), |
| |             sprintf( "%s suspended: %4u\n\n", $name, $counts->{susp} ); |
| |     } |
| |     return join( "", @chunks ); |
| | } |
| | |
| | # Parse the captured bqueues output (file-level @bqueues) into %hqueues. |
| | # For every queue line, fields 8, 9 and 10 of the whitespace-split line (the |
| | # PEND, RUN and SUSP columns of the banner) are stored under |
| | # $hqueues{<queue name>}{pend}, {run} and {susp}. |
| | # Takes no arguments; the return value is not meaningful. |
| | sub process_bqueues { |
| |     # Collapse runs of whitespace in the banner so the header line is |
| |     # recognised however bqueues pads its columns. |
| |     my $banner = join( " ", split( " ", $bqueues_banner ) ); |
| |
| |     foreach my $line ( @bqueues ) { |
| |         my @fields = split( " ", $line ); |
| |
| |         # Blank or truncated lines carry no counters; storing them would |
| |         # create a bogus queue entry with undefined values. |
| |         next if @fields < 11; |
| |
| |         # Skip the header: either the raw line matches the banner as |
| |         # configured, or the whitespace-normalised line matches the |
| |         # whitespace-normalised banner. |
| |         my $flat = join( " ", @fields ); |
| |         next if $line =~ /$bqueues_banner/ || $flat =~ /$banner/; |
| |
| |         $hqueues{$fields[0]}{pend} = $fields[8]; |
| |         $hqueues{$fields[0]}{run} = $fields[9]; |
| |         $hqueues{$fields[0]}{susp} = $fields[10]; |
| |     } |
| | } |
| | |
| | # Send one status message to the Xymon daemon. |
| | # $statusmsg is the message body; the status line is built from the |
| | # file-level $farm, $hobbitcolumn, $color and $summary. |
| | # Returns the value of system() (0 on success) and warns when it is non-zero. |
| | sub send_report { |
| |     my ( $statusmsg ) = @_; |
| |     # Build the status message we hand to the bb client |
| |     my $message = "status " . $farm . "." . $hobbitcolumn . " " . $color . " " . $summary . "\n\n" . $statusmsg; |
| |     # And send the message. The list form of system() bypasses the shell, so |
| |     # quotes, dollar signs or backticks in the message cannot break or alter |
| |     # the command. NOTE(review): this assumes BB holds just the program path, |
| |     # with no extra arguments embedded in it. |
| |     my $rc = system( $bb, $bbdisp, $message ); |
| |     warn "sending the report with '$bb' failed (system returned $rc)\n" if $rc != 0; |
| |     return $rc; |
| | } |
| | |
| | #### MAIN |
| | |
| | # The bb client program and the display server are taken from the |
| | # environment; abort when either one is missing or empty. |
| | $bb = $ENV{BB} or die "BB not defined"; |
| | $bbdisp = $ENV{BBDISP} or die "BBDISP not defined"; |
| |
| | # Fill %hqueues from the bqueues output, format it, and send it off. |
| | process_bqueues(); |
| | my $statusmsg = print_summary(); |
| | send_report( $statusmsg ); |
| | </code> |
| | </hidden> |
| | |
| | ===== Known Bugs and Issues ===== |
| | |
| | ===== To Do ===== |
| | |
| | ===== Credits ===== |
| | |
| | ===== Changelog ===== |
| | |
| | * **2007-04-08** |
| | * Initial release |
| |