Differences

This shows you the differences between two versions of the page.

Link to this comparison view

monitors:lsf_queue [2009/11/23 05:52] (current)
Line 1: Line 1:
 +====== lsf_queues ======
 +
 +^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] |
 +^ Compatibility | Xymon 4.2 |
 +^ Requirements | Perl, unix, Platform LSF |
 +^ Download | None |
 +^ Last Update | 2007-04-08 |
 +
 +===== Description =====
 +
 +This script is used to report the number of jobs in running/​pending/​suspended state 
 +for each queue in an ncv-compatible fashion, allowing queue graphing.
 +
 +===== Installation =====
 +=== Client side ===
 +  * Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on Linux/Debian), owned by user and group hobbit (hobbit:hobbit) with mode 0755
 +  * Add an entry to the clientlaunch.cfg file on each host that should run the script. Example for a Linux/Debian host:<code>
 +[lsfqueues]
 +       ​ENVFILE /​usr/​lib/​hobbit/​client/​etc/​hobbitclient.cfg
 +       CMD /​usr/​lib/​hobbit/​client/​ext/​lsf_queues.pl
 +       ​INTERVAL 5m
 +</​code>​
 +
 +=== Server side ===
 +  * In hobbitserver.cfg:​
 +    - Append "​lsf_queues=ncv"​ to TEST2RRD
 +    - Append "​lsf_queues"​ to GRAPHS
 +    - Add the following line for NCV:<​code>​
 +NCV_lsf_queues="​*:​GAUGE"​
 +</​code>​
 +  * You need an entry in one of the bb-hosts files:<code>
 +1.2.3.4 ​    ​myfarm ​     # noconn TRENDS:​*,​lsf_queues:​lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues
 +</​code>​or if you want to combine it with the [[monitors:​lsf_mon|lsf_mon script]]:<​code>​
 +1.2.3.4 ​    ​myfarm ​     # noconn TRENDS:​*,​lsf,​lsf_queues:​lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues
 +</​code>​
 +  * Edit the hobbitgraph.cfg configuration to add the following entries. Please note that I manually entered an entry for each queue I wanted to display. You will obviously need to change it to adapt it to your setup.
 +<hidden onHidden="​Show Code ⇲" onVisible="​Hide Code ⇱">​
 +<​code>​
 +[lsf_pending_queues]
 +       ​DEF:​queue1=lsf_queues.rrd:​queue1pending:​AVERAGE
 +       ​DEF:​queue2=lsf_queues.rrd:​queue2pending:​AVERAGE
 +       ​DEF:​queue3=lsf_queues.rrd:​queue3pending:​AVERAGE
 +       ​DEF:​queue4=lsf_queues.rrd:​queue4pending:​AVERAGE
 +       ​DEF:​queue5=lsf_queues.rrd:​queue5pending:​AVERAGE
 +       ​DEF:​queue6=lsf_queues.rrd:​queue6pending:​AVERAGE
 +       ​DEF:​queue7=lsf_queues.rrd:​queue7pending:​AVERAGE
 +       ​DEF:​queue8=lsf_queues.rrd:​queue8pending:​AVERAGE
 +       TITLE LSF queues: number of pending jobs per queue
 +       -l 0
 +       YAXIS #
 +       ​AREA:​queue1#​99ccff:​queue1
 +       ​GPRINT:​queue1:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue1:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue1:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue1:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue2#​cc99ff:​queue2
 +       ​GPRINT:​queue2:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue2:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue2:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue2:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue3#​9933ff:​queue3
 +       ​GPRINT:​queue3:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue3:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue3:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue3:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue4#​3399ff:​queue4
 +       ​GPRINT:​queue4:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue4:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue4:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue4:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue5#​ff3333:​queue5
 +       ​GPRINT:​queue5:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue5:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue5:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue5:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue6#​ff9933:​queue6
 +       ​GPRINT:​queue6:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue6:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue6:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue6:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue7#​00ff00:​queue7
 +       ​GPRINT:​queue7:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue7:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue7:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue7:​AVERAGE:​ \: %5.1lf (avg)
 +
 +[lsf_running_queues]
 +       ​DEF:​queue1=lsf_queues.rrd:​queue1running:​AVERAGE
 +       ​DEF:​queue2=lsf_queues.rrd:​queue2running:​AVERAGE
 +       ​DEF:​queue3=lsf_queues.rrd:​queue3running:​AVERAGE
 +       ​DEF:​queue4=lsf_queues.rrd:​queue4running:​AVERAGE
 +       ​DEF:​queue5=lsf_queues.rrd:​queue5running:​AVERAGE
 +       ​DEF:​queue6=lsf_queues.rrd:​queue6running:​AVERAGE
 +       ​DEF:​queue7=lsf_queues.rrd:​queue7running:​AVERAGE
 +       ​DEF:​queue8=lsf_queues.rrd:​queue8running:​AVERAGE
 +       TITLE LSF queues: number of running jobs per queue
 +       -l 0
 +       YAXIS #
 +       ​AREA:​queue1#​99ccff:​queue1
 +       ​GPRINT:​queue1:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue1:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue1:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue1:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue2#​cc99ff:​queue2
 +       ​GPRINT:​queue2:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue2:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue2:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue2:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue3#​9933ff:​queue3
 +       ​GPRINT:​queue3:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue3:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue3:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue3:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue4#​3399ff:​queue4
 +       ​GPRINT:​queue4:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue4:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue4:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue4:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue5#​ff3333:​queue5
 +       ​GPRINT:​queue5:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue5:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue5:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue5:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue6#​ff9933:​queue6
 +       ​GPRINT:​queue6:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue6:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue6:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue6:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue7#​00ff00:​queue7
 +       ​GPRINT:​queue7:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue7:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue7:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue7:​AVERAGE:​ \: %5.1lf (avg)
 +
 +[lsf_suspended_queues]
 +       ​DEF:​queue1=lsf_queues.rrd:​queue1suspended:​AVERAGE
 +       ​DEF:​queue2=lsf_queues.rrd:​queue2suspended:​AVERAGE
 +       ​DEF:​queue3=lsf_queues.rrd:​queue3suspended:​AVERAGE
 +       ​DEF:​queue4=lsf_queues.rrd:​queue4suspended:​AVERAGE
 +       ​DEF:​queue5=lsf_queues.rrd:​queue5suspended:​AVERAGE
 +       ​DEF:​queue6=lsf_queues.rrd:​queue6suspended:​AVERAGE
 +       ​DEF:​queue7=lsf_queues.rrd:​queue7suspended:​AVERAGE
 +       ​DEF:​queue8=lsf_queues.rrd:​queue8suspended:​AVERAGE
 +       TITLE LSF queues: number of suspended jobs per queue
 +       -l 0
 +       YAXIS #
 +       ​AREA:​queue1#​99ccff:​queue1
 +       ​GPRINT:​queue1:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue1:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue1:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue1:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue2#​cc99ff:​queue2
 +       ​GPRINT:​queue2:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue2:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue2:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue2:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue3#​9933ff:​queue3
 +       ​GPRINT:​queue3:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue3:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue3:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue3:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue4#​3399ff:​queue4
 +       ​GPRINT:​queue4:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue4:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue4:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue4:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue5#​ff3333:​queue5
 +       ​GPRINT:​queue5:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue5:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue5:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue5:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue6#​ff9933:​queue6
 +       ​GPRINT:​queue6:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue6:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue6:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue6:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​queue7#​00ff00:​queue7
 +       ​GPRINT:​queue7:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​queue7:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​queue7:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​queue7:​AVERAGE:​ \: %5.1lf (avg)
 +
 +[lsf_queues]
 +       ​DEF:​queue1p=lsf_queues.rrd:​queue1pending:​AVERAGE
 +       ​DEF:​queue2p=lsf_queues.rrd:​queue2pending:​AVERAGE
 +       ​DEF:​queue3p=lsf_queues.rrd:​queue3pending:​AVERAGE
 +       ​DEF:​queue4p=lsf_queues.rrd:​queue4pending:​AVERAGE
 +       ​DEF:​queue5p=lsf_queues.rrd:​queue5pending:​AVERAGE
 +       ​DEF:​queue6p=lsf_queues.rrd:​queue6pending:​AVERAGE
 +       ​DEF:​queue7p=lsf_queues.rrd:​queue7pending:​AVERAGE
 +       ​DEF:​queue8p=lsf_queues.rrd:​queue8pending:​AVERAGE
 +       ​DEF:​queue1r=lsf_queues.rrd:​queue1running:​AVERAGE
 +       ​DEF:​queue2r=lsf_queues.rrd:​queue2running:​AVERAGE
 +       ​DEF:​queue3r=lsf_queues.rrd:​queue3running:​AVERAGE
 +       ​DEF:​queue4r=lsf_queues.rrd:​queue4running:​AVERAGE
 +       ​DEF:​queue5r=lsf_queues.rrd:​queue5running:​AVERAGE
 +       ​DEF:​queue6r=lsf_queues.rrd:​queue6running:​AVERAGE
 +       ​DEF:​queue7r=lsf_queues.rrd:​queue7running:​AVERAGE
 +       ​DEF:​queue8r=lsf_queues.rrd:​queue8running:​AVERAGE
 +       ​DEF:​queue1s=lsf_queues.rrd:​queue1suspended:​AVERAGE
 +       ​DEF:​queue2s=lsf_queues.rrd:​queue2suspended:​AVERAGE
 +       ​DEF:​queue3s=lsf_queues.rrd:​queue3suspended:​AVERAGE
 +       ​DEF:​queue4s=lsf_queues.rrd:​queue4suspended:​AVERAGE
 +       ​DEF:​queue5s=lsf_queues.rrd:​queue5suspended:​AVERAGE
 +       ​DEF:​queue6s=lsf_queues.rrd:​queue6suspended:​AVERAGE
 +       ​DEF:​queue7s=lsf_queues.rrd:​queue7suspended:​AVERAGE
 +       ​DEF:​queue8s=lsf_queues.rrd:​queue8suspended:​AVERAGE
 +       ​DEF:​queue8=lsf_queues.rrd:​queue8running:​AVERAGE
 +
 +CDEF:​pending=queue1p,​queue2p,​+,​queue3p,​+,​queue4p,​+,​queue5p,​+,​queue6p,​+,​queue7p,​+,​queue8p,​+
 +
 +CDEF:​running=queue1r,​queue2r,​+,​queue3r,​+,​queue4r,​+,​queue5r,​+,​queue6r,​+,​queue7r,​+,​queue8r,​+
 +
 +CDEF:​suspended=queue1s,​queue2s,​+,​queue3s,​+,​queue4s,​+,​queue5s,​+,​queue6s,​+,​queue7s,​+,​queue8s,​+
 +       TITLE LSF queues
 +       -l 0
 +       YAXIS #
 +       ​AREA:​running#​cc99ff:​running
 +       ​GPRINT:​running:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​running:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​running:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​running:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​suspended#​9933ff:​suspended
 +       ​GPRINT:​suspended:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​suspended:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​suspended:​MIN:​ \: %5.1lf (min)
 +       ​GPRINT:​suspended:​AVERAGE:​ \: %5.1lf (avg)\n
 +       ​STACK:​pending#​99ccff:​pending
 +       ​GPRINT:​pending:​LAST:​ \: %5.1lf (cur)
 +       ​GPRINT:​pending:​MAX:​ \: %5.1lf (max)
 +       ​GPRINT:​pending:​MIN:​ \: %5.1lf (min)
 +</​code>​
 +</​hidden>​
 +
 +===== Source =====
 +==== lsf_queues.pl ====
 +<hidden onHidden="​Show Code ⇲" onVisible="​Hide Code ⇱">​
 +<code perl>
 +#​!/​usr/​bin/​perl -w
 +#
 +# client-side script to monitor queues on a lsf HPC farm
 +#
 +# copyright 2006 - 2007  Genome Research Limited / Gildas Le Nadan
 +# This script is released under the Gnu Public
 +# License (GPL) version 2 and Later
 +
 +my $version = 1.0.0;
 +
 +use strict;
 +
 +##### PARAMETERS YOU CAN TWEAK
 +
 +# you can debug this script in your environment by setting 1 below and running
 +# BB=echo BBDISP=127.0.0.1 ./​lsf_mon.pl on one of your nodes
 +my $DEBUG = 0;  # 1 for debug, 0 otherwise
 +
 +# hobbit config
 +my $farm = "​farm-login";​ #​ the hostname you want to report under
 +my $hobbitcolumn = "​lsf_queues";​ #​ the column name for the test
 +my $color = "​green";​ #​ default color
 +my $summary = "LSF queues report";​ #​ title of the test output
 +
 +# how the lsf commands must be run
 +my @bqueues=`. /​software/​noarch/​lsf/​conf/​profile.lsf && ​ /​usr/​local/​lsf/​6.1/​linux2.6-glibc2.3-amd64/​bin/​bqueues`;​
 +
 +# lines to ignore in the output
 +my $bqueues_banner="​QUEUE_NAME ​     PRIO STATUS ​         MAX JL/U JL/P JL/H NJOBS  PEND   ​RUN ​ SUSP";
 +
 +#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)
 +
 +my $bb = "";​
 +my $bbdisp = "";​
 +
 +my %hqueues = ();
 +
 +### FUNCTIONS
 +
 +sub print_summary {
 + my $result = "​Queues\n";​
 + foreach my $queue ( sort keys %hqueues ) {
 + $result .= sprintf( "%s pending ​ : %4u\n",​ $queue, $hqueues{$queue}{pend});​
 + $result .= sprintf( "%s running ​ : %4u\n",​ $queue, $hqueues{$queue}{run});​
 + $result .= sprintf( "%s suspended: %4u\n\n",​ $queue, $hqueues{$queue}{susp});​
 + }
 + return $result;
 +}
 +
 +sub process_bqueues {
 + foreach my $line ( @bqueues ) {
 + unless ( $line =~ /​$bqueues_banner/​ ) {
 + my @fields = split( " ", $line );
 + $hqueues{$fields[0]}{pend} = $fields[8];
 + $hqueues{$fields[0]}{run} = $fields[9];
 + $hqueues{$fields[0]}{susp} = $fields[10];​
 + }
 + }
 +}
 +
 +sub send_report {
 + my ( $statusmsg ) = @_;
 + # Build the command we use to send a status to the Xymon daemon
 + my $cmd = $bb . " " . $bbdisp . " \"​status " . $farm . "​."​ . $hobbitcolumn . " " . $color . " " . $summary . "​\n\n"​ . $statusmsg . "​\"";​
 + # And send the message
 + system $cmd;
 +}
 +
 +#### MAIN
 +
 +# Get the BB and BBDISP environment settings.
 +$bb = $ENV{"​BB"​} || die "BB not defined";​
 +$bbdisp = $ENV{"​BBDISP"​} || die "​BBDISP not defined";​
 +
 +my $statusmsg = "";​
 +
 +process_bqueues;​
 +$statusmsg .= print_summary;​
 +send_report( $statusmsg );
 +</​code>​
 +</​hidden>​
 +
 +===== Known  Bugs and Issues =====
 +
 +===== To Do =====
 +
 +===== Credits =====
 +
 +===== Changelog =====
 +
 +  * **2007-04-08**
 +    * Initial release
  
  • monitors/lsf_queue.txt
  • Last modified: 2009/11/23 05:52
  • (external edit)