====== lsf_queues ====== ^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] | ^ Compatibility | Xymon 4.2 | ^ Requirements | Perl, unix, Platform LSF | ^ Download | None | ^ Last Update | 2007-04-08 | ===== Description ===== This script is used to report the number of jobs in running/pending/suspended state for each queue in an ncv-compatible fashion, allowing queue graphing. ===== Installation ===== === Client side === * Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on linux/debian), with owner hobbit.hobbit and rights 0755 * Add an entry in the clientlaunch.cfg file on the hosts running the script to run the script. Example on a linux/debian host: [lsfqueues] ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg CMD /usr/lib/hobbit/client/ext/lsf_queues.pl INTERVAL 5m === Server side === * In hobbitserver.cfg: - Append "lsf_queues=ncv" to TEST2RRD - Append "lsf_queues" to GRAPHS - Add the following line for NCV: NCV_lsf_queues="*:GAUGE" * You need an entry in one of the bb-hosts files: 1.2.3.4 myfarm # noconn TRENDS:*,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues or if you want to combine it with the [[monitors:lsf_mon|lsf_mon script]]: 1.2.3.4 myfarm # noconn TRENDS:*,lsf,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues * Edit the hobbitgraph.cfg configuration to add the following entries. Please note that I manually entered an entry for each queue I wanted to display. You will obviously need to adapt these entries to your setup. 
[lsf_pending_queues] DEF:queue1=lsf_queues.rrd:queue1pending:AVERAGE DEF:queue2=lsf_queues.rrd:queue2pending:AVERAGE DEF:queue3=lsf_queues.rrd:queue3pending:AVERAGE DEF:queue4=lsf_queues.rrd:queue4pending:AVERAGE DEF:queue5=lsf_queues.rrd:queue5pending:AVERAGE DEF:queue6=lsf_queues.rrd:queue6pending:AVERAGE DEF:queue7=lsf_queues.rrd:queue7pending:AVERAGE DEF:queue8=lsf_queues.rrd:queue8pending:AVERAGE TITLE LSF queues: number of pending jobs per queue -l 0 YAXIS # AREA:queue1#99ccff:queue1 GPRINT:queue1:LAST: \: %5.1lf (cur) GPRINT:queue1:MAX: \: %5.1lf (max) GPRINT:queue1:MIN: \: %5.1lf (min) GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n STACK:queue2#cc99ff:queue2 GPRINT:queue2:LAST: \: %5.1lf (cur) GPRINT:queue2:MAX: \: %5.1lf (max) GPRINT:queue2:MIN: \: %5.1lf (min) GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n STACK:queue3#9933ff:queue3 GPRINT:queue3:LAST: \: %5.1lf (cur) GPRINT:queue3:MAX: \: %5.1lf (max) GPRINT:queue3:MIN: \: %5.1lf (min) GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n STACK:queue4#3399ff:queue4 GPRINT:queue4:LAST: \: %5.1lf (cur) GPRINT:queue4:MAX: \: %5.1lf (max) GPRINT:queue4:MIN: \: %5.1lf (min) GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n STACK:queue5#ff3333:queue5 GPRINT:queue5:LAST: \: %5.1lf (cur) GPRINT:queue5:MAX: \: %5.1lf (max) GPRINT:queue5:MIN: \: %5.1lf (min) GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n STACK:queue6#ff9933:queue6 GPRINT:queue6:LAST: \: %5.1lf (cur) GPRINT:queue6:MAX: \: %5.1lf (max) GPRINT:queue6:MIN: \: %5.1lf (min) GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n STACK:queue7#00ff00:queue7 GPRINT:queue7:LAST: \: %5.1lf (cur) GPRINT:queue7:MAX: \: %5.1lf (max) GPRINT:queue7:MIN: \: %5.1lf (min) GPRINT:queue7:AVERAGE: \: %5.1lf (avg) [lsf_running_queues] DEF:queue1=lsf_queues.rrd:queue1running:AVERAGE DEF:queue2=lsf_queues.rrd:queue2running:AVERAGE DEF:queue3=lsf_queues.rrd:queue3running:AVERAGE DEF:queue4=lsf_queues.rrd:queue4running:AVERAGE DEF:queue5=lsf_queues.rrd:queue5running:AVERAGE DEF:queue6=lsf_queues.rrd:queue6running:AVERAGE 
DEF:queue7=lsf_queues.rrd:queue7running:AVERAGE DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE TITLE LSF queues: number of running jobs per queue -l 0 YAXIS # AREA:queue1#99ccff:queue1 GPRINT:queue1:LAST: \: %5.1lf (cur) GPRINT:queue1:MAX: \: %5.1lf (max) GPRINT:queue1:MIN: \: %5.1lf (min) GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n STACK:queue2#cc99ff:queue2 GPRINT:queue2:LAST: \: %5.1lf (cur) GPRINT:queue2:MAX: \: %5.1lf (max) GPRINT:queue2:MIN: \: %5.1lf (min) GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n STACK:queue3#9933ff:queue3 GPRINT:queue3:LAST: \: %5.1lf (cur) GPRINT:queue3:MAX: \: %5.1lf (max) GPRINT:queue3:MIN: \: %5.1lf (min) GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n STACK:queue4#3399ff:queue4 GPRINT:queue4:LAST: \: %5.1lf (cur) GPRINT:queue4:MAX: \: %5.1lf (max) GPRINT:queue4:MIN: \: %5.1lf (min) GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n STACK:queue5#ff3333:queue5 GPRINT:queue5:LAST: \: %5.1lf (cur) GPRINT:queue5:MAX: \: %5.1lf (max) GPRINT:queue5:MIN: \: %5.1lf (min) GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n STACK:queue6#ff9933:queue6 GPRINT:queue6:LAST: \: %5.1lf (cur) GPRINT:queue6:MAX: \: %5.1lf (max) GPRINT:queue6:MIN: \: %5.1lf (min) GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n STACK:queue7#00ff00:queue7 GPRINT:queue7:LAST: \: %5.1lf (cur) GPRINT:queue7:MAX: \: %5.1lf (max) GPRINT:queue7:MIN: \: %5.1lf (min) GPRINT:queue7:AVERAGE: \: %5.1lf (avg) [lsf_suspended_queues] DEF:queue1=lsf_queues.rrd:queue1suspended:AVERAGE DEF:queue2=lsf_queues.rrd:queue2suspended:AVERAGE DEF:queue3=lsf_queues.rrd:queue3suspended:AVERAGE DEF:queue4=lsf_queues.rrd:queue4suspended:AVERAGE DEF:queue5=lsf_queues.rrd:queue5suspended:AVERAGE DEF:queue6=lsf_queues.rrd:queue6suspended:AVERAGE DEF:queue7=lsf_queues.rrd:queue7suspended:AVERAGE DEF:queue8=lsf_queues.rrd:queue8suspended:AVERAGE TITLE LSF queues: number of suspended jobs per queue -l 0 YAXIS # AREA:queue1#99ccff:queue1 GPRINT:queue1:LAST: \: %5.1lf (cur) GPRINT:queue1:MAX: \: %5.1lf (max) GPRINT:queue1:MIN: \: %5.1lf (min) 
GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n STACK:queue2#cc99ff:queue2 GPRINT:queue2:LAST: \: %5.1lf (cur) GPRINT:queue2:MAX: \: %5.1lf (max) GPRINT:queue2:MIN: \: %5.1lf (min) GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n STACK:queue3#9933ff:queue3 GPRINT:queue3:LAST: \: %5.1lf (cur) GPRINT:queue3:MAX: \: %5.1lf (max) GPRINT:queue3:MIN: \: %5.1lf (min) GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n STACK:queue4#3399ff:queue4 GPRINT:queue4:LAST: \: %5.1lf (cur) GPRINT:queue4:MAX: \: %5.1lf (max) GPRINT:queue4:MIN: \: %5.1lf (min) GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n STACK:queue5#ff3333:queue5 GPRINT:queue5:LAST: \: %5.1lf (cur) GPRINT:queue5:MAX: \: %5.1lf (max) GPRINT:queue5:MIN: \: %5.1lf (min) GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n STACK:queue6#ff9933:queue6 GPRINT:queue6:LAST: \: %5.1lf (cur) GPRINT:queue6:MAX: \: %5.1lf (max) GPRINT:queue6:MIN: \: %5.1lf (min) GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n STACK:queue7#00ff00:queue7 GPRINT:queue7:LAST: \: %5.1lf (cur) GPRINT:queue7:MAX: \: %5.1lf (max) GPRINT:queue7:MIN: \: %5.1lf (min) GPRINT:queue7:AVERAGE: \: %5.1lf (avg) [lsf_queues] DEF:queue1p=lsf_queues.rrd:queue1pending:AVERAGE DEF:queue2p=lsf_queues.rrd:queue2pending:AVERAGE DEF:queue3p=lsf_queues.rrd:queue3pending:AVERAGE DEF:queue4p=lsf_queues.rrd:queue4pending:AVERAGE DEF:queue5p=lsf_queues.rrd:queue5pending:AVERAGE DEF:queue6p=lsf_queues.rrd:queue6pending:AVERAGE DEF:queue7p=lsf_queues.rrd:queue7pending:AVERAGE DEF:queue8p=lsf_queues.rrd:queue8pending:AVERAGE DEF:queue1r=lsf_queues.rrd:queue1running:AVERAGE DEF:queue2r=lsf_queues.rrd:queue2running:AVERAGE DEF:queue3r=lsf_queues.rrd:queue3running:AVERAGE DEF:queue4r=lsf_queues.rrd:queue4running:AVERAGE DEF:queue5r=lsf_queues.rrd:queue5running:AVERAGE DEF:queue6r=lsf_queues.rrd:queue6running:AVERAGE DEF:queue7r=lsf_queues.rrd:queue7running:AVERAGE DEF:queue8r=lsf_queues.rrd:queue8running:AVERAGE DEF:queue1s=lsf_queues.rrd:queue1suspended:AVERAGE DEF:queue2s=lsf_queues.rrd:queue2suspended:AVERAGE 
DEF:queue3s=lsf_queues.rrd:queue3suspended:AVERAGE DEF:queue4s=lsf_queues.rrd:queue4suspended:AVERAGE DEF:queue5s=lsf_queues.rrd:queue5suspended:AVERAGE DEF:queue6s=lsf_queues.rrd:queue6suspended:AVERAGE DEF:queue7s=lsf_queues.rrd:queue7suspended:AVERAGE DEF:queue8s=lsf_queues.rrd:queue8suspended:AVERAGE DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE CDEF:pending=queue1p,queue2p,+,queue3p,+,queue4p,+,queue5p,+,queue6p,+,queue7p,+,queue8p,+ CDEF:running=queue1r,queue2r,+,queue3r,+,queue4r,+,queue5r,+,queue6r,+,queue7r,+,queue8r,+ CDEF:suspended=queue1s,queue2s,+,queue3s,+,queue4s,+,queue5s,+,queue6s,+,queue7s,+,queue8s,+ TITLE LSF queues -l 0 YAXIS # AREA:running#cc99ff:running GPRINT:running:LAST: \: %5.1lf (cur) GPRINT:running:MAX: \: %5.1lf (max) GPRINT:running:MIN: \: %5.1lf (min) GPRINT:running:AVERAGE: \: %5.1lf (avg)\n STACK:suspended#9933ff:suspended GPRINT:suspended:LAST: \: %5.1lf (cur) GPRINT:suspended:MAX: \: %5.1lf (max) GPRINT:suspended:MIN: \: %5.1lf (min) GPRINT:suspended:AVERAGE: \: %5.1lf (avg)\n STACK:pending#99ccff:pending GPRINT:pending:LAST: \: %5.1lf (cur) GPRINT:pending:MAX: \: %5.1lf (max) GPRINT:pending:MIN: \: %5.1lf (min) ===== Source ===== ==== lsf_queues.pl ====
#!/usr/bin/perl -w
#
# client-side script to monitor queues on a lsf HPC farm
#
# copyright 2006 - 2007 Genome Research Limited / Gildas Le Nadan
# This script is released under the Gnu Public
# License (GPL) version 2 and Later

use strict;
use warnings;

# Quoted on purpose: a bare 1.0.0 is parsed by Perl as a v-string
# (three control characters), not as the text "1.0.0".
my $version = '1.0.0';

##### PARAMETERS YOU CAN TWEAK

# you can debug this script in your environment by setting 1 below and running
# BB=echo BBDISP=127.0.0.1 ./lsf_queues.pl on one of your nodes
my $DEBUG = 0; # 1 for debug, 0 otherwise

# hobbit config
my $farm         = "farm-login";        # the hostname you want to report under
my $hobbitcolumn = "lsf_queues";        # the column name for the test
my $color        = "green";             # default color
my $summary      = "LSF queues report"; # title of the test output

# how the lsf commands must be run
# (the LSF profile is sourced first, then bqueues is run, in one shell)
my @bqueues=`. /software/noarch/lsf/conf/profile.lsf && /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bqueues`;

# Keep the original best-effort behaviour (a report is still sent), but do
# not stay silent when the command could not be run successfully.
warn "bqueues command failed (wait status $?)\n" if $? != 0;

# lines to ignore in the output
my $bqueues_banner="QUEUE_NAME PRIO STATUS MAX JL/U JL/P JL/H NJOBS PEND RUN SUSP";

#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)
my $bb     = "";
my $bbdisp = "";

# queue name => { pend => N, run => N, susp => N }
my %hqueues = ();

# The banner is matched column by column with any amount of whitespace in
# between, so it does not depend on the exact spacing of the bqueues output.
my $bqueues_banner_str = join '\s+',
    map { quotemeta($_) } split( " ", $bqueues_banner );
my $bqueues_banner_re = qr/$bqueues_banner_str/;

### FUNCTIONS

# Build the body of the status message from %hqueues: one
# "<queue> <state> : <count>" line per state and per queue, queues sorted
# by name. Returns the text as a single string.
sub print_summary {
    my $result = "Queues\n";

    foreach my $queue ( sort keys %hqueues ) {
        $result .= sprintf( "%s pending : %4u\n",    $queue, $hqueues{$queue}{pend} );
        $result .= sprintf( "%s running : %4u\n",    $queue, $hqueues{$queue}{run} );
        $result .= sprintf( "%s suspended: %4u\n\n", $queue, $hqueues{$queue}{susp} );
    }

    return $result;
}

# Fill %hqueues from the captured bqueues output (@bqueues).
# The banner line is ignored, as are lines that do not carry all eleven
# columns named in the banner (blank lines, truncated output), so no queue
# entry is ever created from missing fields.
sub process_bqueues {
    foreach my $line ( @bqueues ) {
        next if $line =~ $bqueues_banner_re;

        my @fields = split( " ", $line );
        next if @fields < 11;

        # columns: 0 = QUEUE_NAME, 8 = PEND, 9 = RUN, 10 = SUSP
        my ( $queue, $pend, $run, $susp ) = @fields[ 0, 8, 9, 10 ];
        $hqueues{$queue}{pend} = $pend;
        $hqueues{$queue}{run}  = $run;
        $hqueues{$queue}{susp} = $susp;
    }

    return;
}

# Send the status message to the Xymon daemon.
#   $statusmsg - body of the report, appended after the summary line
# The sender program ($bb) is run directly, without a shell, with the
# display host ($bbdisp) and the whole message as its two arguments, so
# quotes or shell metacharacters in the message cannot alter the command.
sub send_report {
    my ( $statusmsg ) = @_;

    # Build the message we send to the Xymon daemon
    my $message = "status " . $farm . "." . $hobbitcolumn . " " . $color
        . " " . $summary . "\n\n" . $statusmsg;

    print STDERR "DEBUG: $bb $bbdisp \"$message\"\n" if $DEBUG;

    # And send the message
    system( $bb, $bbdisp, $message ) == 0
        or warn "sending the report with $bb failed (wait status $?)\n";

    return;
}

#### MAIN

# Get the BB and BBDISP environment settings.
$bb     = $ENV{"BB"}     || die "BB not defined";
$bbdisp = $ENV{"BBDISP"} || die "BBDISP not defined";

my $statusmsg = "";

process_bqueues();

$statusmsg .= print_summary();

send_report( $statusmsg );
===== Known Bugs and Issues ===== ===== To Do ===== ===== Credits ===== ===== Changelog ===== * **2007-04-08** * Initial release