| |
— | monitors:lsf_queue [2009/11/23 05:52] (current) – created - external edit 127.0.0.1 |
---|
| ====== lsf_queues ====== |
| |
| ^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] | |
| ^ Compatibility | Xymon 4.2 | |
| ^ Requirements | Perl, unix, Platform LSF | |
| ^ Download | None | |
| ^ Last Update | 2007-04-08 | |
| |
| ===== Description ===== |
| |
| This script is used to report the number of jobs in running/pending/suspended state |
| for each queue in an ncv-compatible fashion, allowing queue graphing. |
| |
| ===== Installation ===== |
| === Client side === |
| * Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on Linux/Debian), with owner hobbit.hobbit and permissions 0755 |
| * Add an entry to the clientlaunch.cfg file on the host running the script so that the script gets launched. Example on a Linux/Debian host:<code> |
| [lsfqueues] |
| ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg |
| CMD /usr/lib/hobbit/client/ext/lsf_queues.pl |
| INTERVAL 5m |
| </code> |
| |
| === Server side === |
| * In hobbitserver.cfg: |
| - Append "lsf_queues=ncv" to TEST2RRD |
| - Append "lsf_queues" to GRAPHS |
| - Add the following line for NCV:<code> |
| NCV_lsf_queues="*:GAUGE" |
| </code> |
| * You need an entry in one of the bb-hosts file:<code> |
| 1.2.3.4 myfarm # noconn TRENDS:*,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues |
| </code>or if you want to combine it with the [[monitors:lsf_mon|lsf_mon script]]:<code> |
| 1.2.3.4 myfarm # noconn TRENDS:*,lsf,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues |
| </code> |
| * Edit the hobbitgraph.cfg configuration to add the following entries. Note that every queue to be displayed has its own manually written entry, so you will need to adapt these entries to the queues of your own setup. |
| <hidden onHidden="Show Code ⇲" onVisible="Hide Code ⇱"> |
| <code> |
| [lsf_pending_queues] |
| DEF:queue1=lsf_queues.rrd:queue1pending:AVERAGE |
| DEF:queue2=lsf_queues.rrd:queue2pending:AVERAGE |
| DEF:queue3=lsf_queues.rrd:queue3pending:AVERAGE |
| DEF:queue4=lsf_queues.rrd:queue4pending:AVERAGE |
| DEF:queue5=lsf_queues.rrd:queue5pending:AVERAGE |
| DEF:queue6=lsf_queues.rrd:queue6pending:AVERAGE |
| DEF:queue7=lsf_queues.rrd:queue7pending:AVERAGE |
| DEF:queue8=lsf_queues.rrd:queue8pending:AVERAGE |
| TITLE LSF queues: number of pending jobs per queue |
| -l 0 |
| YAXIS # |
| AREA:queue1#99ccff:queue1 |
| GPRINT:queue1:LAST: \: %5.1lf (cur) |
| GPRINT:queue1:MAX: \: %5.1lf (max) |
| GPRINT:queue1:MIN: \: %5.1lf (min) |
| GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue2#cc99ff:queue2 |
| GPRINT:queue2:LAST: \: %5.1lf (cur) |
| GPRINT:queue2:MAX: \: %5.1lf (max) |
| GPRINT:queue2:MIN: \: %5.1lf (min) |
| GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue3#9933ff:queue3 |
| GPRINT:queue3:LAST: \: %5.1lf (cur) |
| GPRINT:queue3:MAX: \: %5.1lf (max) |
| GPRINT:queue3:MIN: \: %5.1lf (min) |
| GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue4#3399ff:queue4 |
| GPRINT:queue4:LAST: \: %5.1lf (cur) |
| GPRINT:queue4:MAX: \: %5.1lf (max) |
| GPRINT:queue4:MIN: \: %5.1lf (min) |
| GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue5#ff3333:queue5 |
| GPRINT:queue5:LAST: \: %5.1lf (cur) |
| GPRINT:queue5:MAX: \: %5.1lf (max) |
| GPRINT:queue5:MIN: \: %5.1lf (min) |
| GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue6#ff9933:queue6 |
| GPRINT:queue6:LAST: \: %5.1lf (cur) |
| GPRINT:queue6:MAX: \: %5.1lf (max) |
| GPRINT:queue6:MIN: \: %5.1lf (min) |
| GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue7#00ff00:queue7 |
| GPRINT:queue7:LAST: \: %5.1lf (cur) |
| GPRINT:queue7:MAX: \: %5.1lf (max) |
| GPRINT:queue7:MIN: \: %5.1lf (min) |
| GPRINT:queue7:AVERAGE: \: %5.1lf (avg) |
| |
| [lsf_running_queues] |
| DEF:queue1=lsf_queues.rrd:queue1running:AVERAGE |
| DEF:queue2=lsf_queues.rrd:queue2running:AVERAGE |
| DEF:queue3=lsf_queues.rrd:queue3running:AVERAGE |
| DEF:queue4=lsf_queues.rrd:queue4running:AVERAGE |
| DEF:queue5=lsf_queues.rrd:queue5running:AVERAGE |
| DEF:queue6=lsf_queues.rrd:queue6running:AVERAGE |
| DEF:queue7=lsf_queues.rrd:queue7running:AVERAGE |
| DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE |
| TITLE LSF queues: number of running jobs per queue |
| -l 0 |
| YAXIS # |
| AREA:queue1#99ccff:queue1 |
| GPRINT:queue1:LAST: \: %5.1lf (cur) |
| GPRINT:queue1:MAX: \: %5.1lf (max) |
| GPRINT:queue1:MIN: \: %5.1lf (min) |
| GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue2#cc99ff:queue2 |
| GPRINT:queue2:LAST: \: %5.1lf (cur) |
| GPRINT:queue2:MAX: \: %5.1lf (max) |
| GPRINT:queue2:MIN: \: %5.1lf (min) |
| GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue3#9933ff:queue3 |
| GPRINT:queue3:LAST: \: %5.1lf (cur) |
| GPRINT:queue3:MAX: \: %5.1lf (max) |
| GPRINT:queue3:MIN: \: %5.1lf (min) |
| GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue4#3399ff:queue4 |
| GPRINT:queue4:LAST: \: %5.1lf (cur) |
| GPRINT:queue4:MAX: \: %5.1lf (max) |
| GPRINT:queue4:MIN: \: %5.1lf (min) |
| GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue5#ff3333:queue5 |
| GPRINT:queue5:LAST: \: %5.1lf (cur) |
| GPRINT:queue5:MAX: \: %5.1lf (max) |
| GPRINT:queue5:MIN: \: %5.1lf (min) |
| GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue6#ff9933:queue6 |
| GPRINT:queue6:LAST: \: %5.1lf (cur) |
| GPRINT:queue6:MAX: \: %5.1lf (max) |
| GPRINT:queue6:MIN: \: %5.1lf (min) |
| GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue7#00ff00:queue7 |
| GPRINT:queue7:LAST: \: %5.1lf (cur) |
| GPRINT:queue7:MAX: \: %5.1lf (max) |
| GPRINT:queue7:MIN: \: %5.1lf (min) |
| GPRINT:queue7:AVERAGE: \: %5.1lf (avg) |
| |
| [lsf_suspended_queues] |
| DEF:queue1=lsf_queues.rrd:queue1suspended:AVERAGE |
| DEF:queue2=lsf_queues.rrd:queue2suspended:AVERAGE |
| DEF:queue3=lsf_queues.rrd:queue3suspended:AVERAGE |
| DEF:queue4=lsf_queues.rrd:queue4suspended:AVERAGE |
| DEF:queue5=lsf_queues.rrd:queue5suspended:AVERAGE |
| DEF:queue6=lsf_queues.rrd:queue6suspended:AVERAGE |
| DEF:queue7=lsf_queues.rrd:queue7suspended:AVERAGE |
| DEF:queue8=lsf_queues.rrd:queue8suspended:AVERAGE |
| TITLE LSF queues: number of suspended jobs per queue |
| -l 0 |
| YAXIS # |
| AREA:queue1#99ccff:queue1 |
| GPRINT:queue1:LAST: \: %5.1lf (cur) |
| GPRINT:queue1:MAX: \: %5.1lf (max) |
| GPRINT:queue1:MIN: \: %5.1lf (min) |
| GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue2#cc99ff:queue2 |
| GPRINT:queue2:LAST: \: %5.1lf (cur) |
| GPRINT:queue2:MAX: \: %5.1lf (max) |
| GPRINT:queue2:MIN: \: %5.1lf (min) |
| GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue3#9933ff:queue3 |
| GPRINT:queue3:LAST: \: %5.1lf (cur) |
| GPRINT:queue3:MAX: \: %5.1lf (max) |
| GPRINT:queue3:MIN: \: %5.1lf (min) |
| GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue4#3399ff:queue4 |
| GPRINT:queue4:LAST: \: %5.1lf (cur) |
| GPRINT:queue4:MAX: \: %5.1lf (max) |
| GPRINT:queue4:MIN: \: %5.1lf (min) |
| GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue5#ff3333:queue5 |
| GPRINT:queue5:LAST: \: %5.1lf (cur) |
| GPRINT:queue5:MAX: \: %5.1lf (max) |
| GPRINT:queue5:MIN: \: %5.1lf (min) |
| GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue6#ff9933:queue6 |
| GPRINT:queue6:LAST: \: %5.1lf (cur) |
| GPRINT:queue6:MAX: \: %5.1lf (max) |
| GPRINT:queue6:MIN: \: %5.1lf (min) |
| GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n |
| STACK:queue7#00ff00:queue7 |
| GPRINT:queue7:LAST: \: %5.1lf (cur) |
| GPRINT:queue7:MAX: \: %5.1lf (max) |
| GPRINT:queue7:MIN: \: %5.1lf (min) |
| GPRINT:queue7:AVERAGE: \: %5.1lf (avg) |
| |
| [lsf_queues] |
| DEF:queue1p=lsf_queues.rrd:queue1pending:AVERAGE |
| DEF:queue2p=lsf_queues.rrd:queue2pending:AVERAGE |
| DEF:queue3p=lsf_queues.rrd:queue3pending:AVERAGE |
| DEF:queue4p=lsf_queues.rrd:queue4pending:AVERAGE |
| DEF:queue5p=lsf_queues.rrd:queue5pending:AVERAGE |
| DEF:queue6p=lsf_queues.rrd:queue6pending:AVERAGE |
| DEF:queue7p=lsf_queues.rrd:queue7pending:AVERAGE |
| DEF:queue8p=lsf_queues.rrd:queue8pending:AVERAGE |
| DEF:queue1r=lsf_queues.rrd:queue1running:AVERAGE |
| DEF:queue2r=lsf_queues.rrd:queue2running:AVERAGE |
| DEF:queue3r=lsf_queues.rrd:queue3running:AVERAGE |
| DEF:queue4r=lsf_queues.rrd:queue4running:AVERAGE |
| DEF:queue5r=lsf_queues.rrd:queue5running:AVERAGE |
| DEF:queue6r=lsf_queues.rrd:queue6running:AVERAGE |
| DEF:queue7r=lsf_queues.rrd:queue7running:AVERAGE |
| DEF:queue8r=lsf_queues.rrd:queue8running:AVERAGE |
| DEF:queue1s=lsf_queues.rrd:queue1suspended:AVERAGE |
| DEF:queue2s=lsf_queues.rrd:queue2suspended:AVERAGE |
| DEF:queue3s=lsf_queues.rrd:queue3suspended:AVERAGE |
| DEF:queue4s=lsf_queues.rrd:queue4suspended:AVERAGE |
| DEF:queue5s=lsf_queues.rrd:queue5suspended:AVERAGE |
| DEF:queue6s=lsf_queues.rrd:queue6suspended:AVERAGE |
| DEF:queue7s=lsf_queues.rrd:queue7suspended:AVERAGE |
| DEF:queue8s=lsf_queues.rrd:queue8suspended:AVERAGE |
| DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE |
| |
| CDEF:pending=queue1p,queue2p,+,queue3p,+,queue4p,+,queue5p,+,queue6p,+,queue7p,+,queue8p,+ |
| |
| CDEF:running=queue1r,queue2r,+,queue3r,+,queue4r,+,queue5r,+,queue6r,+,queue7r,+,queue8r,+ |
| |
| CDEF:suspended=queue1s,queue2s,+,queue3s,+,queue4s,+,queue5s,+,queue6s,+,queue7s,+,queue8s,+ |
| TITLE LSF queues |
| -l 0 |
| YAXIS # |
| AREA:running#cc99ff:running |
| GPRINT:running:LAST: \: %5.1lf (cur) |
| GPRINT:running:MAX: \: %5.1lf (max) |
| GPRINT:running:MIN: \: %5.1lf (min) |
| GPRINT:running:AVERAGE: \: %5.1lf (avg)\n |
| STACK:suspended#9933ff:suspended |
| GPRINT:suspended:LAST: \: %5.1lf (cur) |
| GPRINT:suspended:MAX: \: %5.1lf (max) |
| GPRINT:suspended:MIN: \: %5.1lf (min) |
| GPRINT:suspended:AVERAGE: \: %5.1lf (avg)\n |
| STACK:pending#99ccff:pending |
| GPRINT:pending:LAST: \: %5.1lf (cur) |
| GPRINT:pending:MAX: \: %5.1lf (max) |
| GPRINT:pending:MIN: \: %5.1lf (min) |
| GPRINT:pending:AVERAGE: \: %5.1lf (avg) |
| </code> |
| </hidden> |
| |
| ===== Source ===== |
| ==== lsf_queues.pl ==== |
| <hidden onHidden="Show Code ⇲" onVisible="Hide Code ⇱"> |
| <code perl> |
| #!/usr/bin/perl -w |
| # |
| # client-side script to monitor queues on a lsf HPC farm |
| # |
| # copyright 2006 - 2007 Genome Research Limited / Gildas Le Nadan |
| # This script is released under the Gnu Public |
| # License (GPL) version 2 and Later |
| |
| use strict; |
| |
| # Version of this script. The value is quoted on purpose: a bare 1.0.0 (two |
| # dots) is parsed by Perl as a v-string, i.e. the three characters chr(1), |
| # chr(0), chr(0), and not as the text "1.0.0". |
| my $version = "1.0.0"; |
| |
| ##### PARAMETERS YOU CAN TWEAK |
| |
| # Debug switch: 1 for debug, 0 otherwise. To try the script by hand on one of |
| # your nodes, run: BB=echo BBDISP=127.0.0.1 ./lsf_queues.pl |
| my $DEBUG = 0; |
| |
| # hobbit config |
| my $farm         = 'farm-login';        # the hostname you want to report under |
| my $hobbitcolumn = 'lsf_queues';        # the column name for the test |
| my $color        = 'green';             # default color |
| my $summary      = 'LSF queues report'; # title of the test output |
| |
| # How the LSF command must be run: source the LSF profile first, then call |
| # bqueues. The output is captured here, one array element per output line. |
| my @bqueues = qx{. /software/noarch/lsf/conf/profile.lsf && /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bqueues}; |
| |
| # Header line of the bqueues output; it carries no queue data and is ignored. |
| my $bqueues_banner = 'QUEUE_NAME PRIO STATUS MAX JL/U JL/P JL/H NJOBS PEND RUN SUSP'; |
| |
| #### INTERNAL PARAMETERS (NO TWEAKING REQUIRED) |
| |
| # Sender command and display server; both are filled in from the BB and |
| # BBDISP environment variables in the MAIN section. |
| my ( $bb, $bbdisp ) = ( "", "" ); |
| |
| # Job counters per queue: $hqueues{<queue name>}{pend}, {run} and {susp}. |
| my %hqueues; |
| |
| ### FUNCTIONS |
| |
| # Build the body of the status message from the file-level %hqueues. |
| # Takes no arguments. Returns a string that starts with "Queues\n" and then, |
| # for every queue in sorted name order, holds one "<queue> <state> : <count>" |
| # line for each of pending, running and suspended, followed by a blank line. |
| sub print_summary { |
|     my @chunks = ( "Queues\n" ); |
|     for my $name ( sort keys %hqueues ) { |
|         my $counts = $hqueues{$name}; |
|         push @chunks, |
|             sprintf( "%s pending : %4u\n", $name, $counts->{pend} ), |
|             sprintf( "%s running : %4u\n", $name, $counts->{run} ), |
|             sprintf( "%s suspended: %4u\n\n", $name, $counts->{susp} ); |
|     } |
|     return join( "", @chunks ); |
| } |
| |
| # Parse the captured bqueues output (file-level @bqueues) and store, for each |
| # queue, its pending, running and suspended job counts in the file-level |
| # %hqueues under the keys pend, run and susp. |
| # Takes no arguments; the return value is not meaningful. |
| sub process_bqueues { |
|     foreach my $line ( @bqueues ) { |
|         # Skip the header when it matches the configured banner text. The |
|         # banner is quoted so it is matched literally, not as a pattern. |
|         next if $line =~ /\Q$bqueues_banner\E/; |
| |
|         my @fields = split( " ", $line ); |
| |
|         # Blank or truncated lines do not carry all 11 columns; using them |
|         # would create an undefined queue name and undefined counters. |
|         next if @fields < 11; |
| |
|         # Also recognise the header by its first column, so that a header |
|         # whose column spacing differs from the banner text is not stored |
|         # as a queue named QUEUE_NAME. |
|         next if $fields[0] eq 'QUEUE_NAME'; |
| |
|         # Columns 9 to 11 of a queue line are PEND, RUN and SUSP. |
|         @{ $hqueues{ $fields[0] } }{qw(pend run susp)} = @fields[ 8 .. 10 ]; |
|     } |
|     return; |
| } |
| |
| # Send one status message to the Xymon daemon. |
| # Argument: $statusmsg, the text placed below the summary line. |
| # Reads the file-level $bb, $bbdisp, $farm, $hobbitcolumn, $color and $summary. |
| # Returns the value returned by system() (0 when the sender succeeded). |
| sub send_report { |
|     my ( $statusmsg ) = @_; |
| |
|     # The message is "status <host>.<column> <color> <summary>", a blank |
|     # line, then the report body. |
|     my $message = "status " . $farm . "." . $hobbitcolumn . " " . $color . " " . $summary . "\n\n" . $statusmsg; |
| |
|     # Run the sender without a shell: the message is passed as one single |
|     # argument, so quotes, dollar signs or backticks in it are sent as-is |
|     # instead of being interpreted. $bb is split on whitespace so that a |
|     # sender command carrying its own options keeps working. |
|     my @sender = split( " ", $bb ); |
|     my $rc = system( @sender, $bbdisp, $message ); |
| |
|     # Do not let a failed delivery go unnoticed. |
|     warn "failed to send the status report with '$bb' (system returned $rc)\n" |
|         if $rc != 0; |
| |
|     return $rc; |
| } |
| |
| #### MAIN |
| |
| # The sender command (BB) and the display server (BBDISP) come from the |
| # environment; stop right away when either one is missing or empty. |
| $bb     = $ENV{BB}     or die "BB not defined"; |
| $bbdisp = $ENV{BBDISP} or die "BBDISP not defined"; |
| |
| # Collect the per-queue counters, format them and send the report. |
| process_bqueues(); |
| my $statusmsg = print_summary(); |
| send_report( $statusmsg ); |
| </code> |
| </hidden> |
| |
| ===== Known Bugs and Issues ===== |
| |
| ===== To Do ===== |
| |
| ===== Credits ===== |
| |
| ===== Changelog ===== |
| |
| * **2007-04-08** |
| * Initial release |
| |