monitors:lsf_queue

no way to compare when less than two revisions

Differences

This shows you the differences between two versions of the page.


monitors:lsf_queue [2009/11/23 05:52] (current) – created - external edit 127.0.0.1
Line 1: Line 1:
 +====== lsf_queues ======
 +
 +^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] |
 +^ Compatibility | Xymon 4.2 |
 +^ Requirements | Perl, unix, Platform LSF |
 +^ Download | None |
 +^ Last Update | 2007-04-08 |
 +
 +===== Description =====
 +
 +This script is used to report the number of jobs in running/pending/suspended state 
 +for each queue in an ncv-compatible fashion, allowing queue graphing.
 +
 +===== Installation =====
 +=== Client side ===
 +  * Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on Linux/Debian), with owner hobbit.hobbit and permissions 0755
 +  * Add an entry to the clientlaunch.cfg file on each host that should run the script. Example on a Linux/Debian host:<code>
 +[lsfqueues]
 +       ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg
 +       CMD /usr/lib/hobbit/client/ext/lsf_queues.pl
 +       INTERVAL 5m
 +</code>
 +
 +=== Server side ===
 +  * In hobbitserver.cfg:
 +    - Append "lsf_queues=ncv" to TEST2RRD
 +    - Append "lsf_queues" to GRAPHS
 +    - Add the following line for NCV:<code>
 +NCV_lsf_queues="*:GAUGE"
 +</code>
 +  * You need an entry in one of the bb-hosts files:<code>
 +1.2.3.4     myfarm      # noconn TRENDS:*,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues
 +</code>or if you want to combine it with the [[monitors:lsf_mon|lsf_mon script]]:<code>
 +1.2.3.4     myfarm      # noconn TRENDS:*,lsf,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues
 +</code>
 +  * Edit the hobbitgraph.cfg configuration to add the following entries. Please note that I manually entered an entry for each queue I wanted to display. You will obviously need to change it to adapt it to your setup.
 +<hidden onHidden="Show Code ⇲" onVisible="Hide Code ⇱">
 +<code>
 +[lsf_pending_queues]
 +       DEF:queue1=lsf_queues.rrd:queue1pending:AVERAGE
 +       DEF:queue2=lsf_queues.rrd:queue2pending:AVERAGE
 +       DEF:queue3=lsf_queues.rrd:queue3pending:AVERAGE
 +       DEF:queue4=lsf_queues.rrd:queue4pending:AVERAGE
 +       DEF:queue5=lsf_queues.rrd:queue5pending:AVERAGE
 +       DEF:queue6=lsf_queues.rrd:queue6pending:AVERAGE
 +       DEF:queue7=lsf_queues.rrd:queue7pending:AVERAGE
 +       DEF:queue8=lsf_queues.rrd:queue8pending:AVERAGE
 +       TITLE LSF queues: number of pending jobs per queue
 +       -l 0
 +       YAXIS #
 +       AREA:queue1#99ccff:queue1
 +       GPRINT:queue1:LAST: \: %5.1lf (cur)
 +       GPRINT:queue1:MAX: \: %5.1lf (max)
 +       GPRINT:queue1:MIN: \: %5.1lf (min)
 +       GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue2#cc99ff:queue2
 +       GPRINT:queue2:LAST: \: %5.1lf (cur)
 +       GPRINT:queue2:MAX: \: %5.1lf (max)
 +       GPRINT:queue2:MIN: \: %5.1lf (min)
 +       GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue3#9933ff:queue3
 +       GPRINT:queue3:LAST: \: %5.1lf (cur)
 +       GPRINT:queue3:MAX: \: %5.1lf (max)
 +       GPRINT:queue3:MIN: \: %5.1lf (min)
 +       GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue4#3399ff:queue4
 +       GPRINT:queue4:LAST: \: %5.1lf (cur)
 +       GPRINT:queue4:MAX: \: %5.1lf (max)
 +       GPRINT:queue4:MIN: \: %5.1lf (min)
 +       GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue5#ff3333:queue5
 +       GPRINT:queue5:LAST: \: %5.1lf (cur)
 +       GPRINT:queue5:MAX: \: %5.1lf (max)
 +       GPRINT:queue5:MIN: \: %5.1lf (min)
 +       GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue6#ff9933:queue6
 +       GPRINT:queue6:LAST: \: %5.1lf (cur)
 +       GPRINT:queue6:MAX: \: %5.1lf (max)
 +       GPRINT:queue6:MIN: \: %5.1lf (min)
 +       GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue7#00ff00:queue7
 +       GPRINT:queue7:LAST: \: %5.1lf (cur)
 +       GPRINT:queue7:MAX: \: %5.1lf (max)
 +       GPRINT:queue7:MIN: \: %5.1lf (min)
 +       GPRINT:queue7:AVERAGE: \: %5.1lf (avg)
 +
 +[lsf_running_queues]
 +       DEF:queue1=lsf_queues.rrd:queue1running:AVERAGE
 +       DEF:queue2=lsf_queues.rrd:queue2running:AVERAGE
 +       DEF:queue3=lsf_queues.rrd:queue3running:AVERAGE
 +       DEF:queue4=lsf_queues.rrd:queue4running:AVERAGE
 +       DEF:queue5=lsf_queues.rrd:queue5running:AVERAGE
 +       DEF:queue6=lsf_queues.rrd:queue6running:AVERAGE
 +       DEF:queue7=lsf_queues.rrd:queue7running:AVERAGE
 +       DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE
 +       TITLE LSF queues: number of running jobs per queue
 +       -l 0
 +       YAXIS #
 +       AREA:queue1#99ccff:queue1
 +       GPRINT:queue1:LAST: \: %5.1lf (cur)
 +       GPRINT:queue1:MAX: \: %5.1lf (max)
 +       GPRINT:queue1:MIN: \: %5.1lf (min)
 +       GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue2#cc99ff:queue2
 +       GPRINT:queue2:LAST: \: %5.1lf (cur)
 +       GPRINT:queue2:MAX: \: %5.1lf (max)
 +       GPRINT:queue2:MIN: \: %5.1lf (min)
 +       GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue3#9933ff:queue3
 +       GPRINT:queue3:LAST: \: %5.1lf (cur)
 +       GPRINT:queue3:MAX: \: %5.1lf (max)
 +       GPRINT:queue3:MIN: \: %5.1lf (min)
 +       GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue4#3399ff:queue4
 +       GPRINT:queue4:LAST: \: %5.1lf (cur)
 +       GPRINT:queue4:MAX: \: %5.1lf (max)
 +       GPRINT:queue4:MIN: \: %5.1lf (min)
 +       GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue5#ff3333:queue5
 +       GPRINT:queue5:LAST: \: %5.1lf (cur)
 +       GPRINT:queue5:MAX: \: %5.1lf (max)
 +       GPRINT:queue5:MIN: \: %5.1lf (min)
 +       GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue6#ff9933:queue6
 +       GPRINT:queue6:LAST: \: %5.1lf (cur)
 +       GPRINT:queue6:MAX: \: %5.1lf (max)
 +       GPRINT:queue6:MIN: \: %5.1lf (min)
 +       GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue7#00ff00:queue7
 +       GPRINT:queue7:LAST: \: %5.1lf (cur)
 +       GPRINT:queue7:MAX: \: %5.1lf (max)
 +       GPRINT:queue7:MIN: \: %5.1lf (min)
 +       GPRINT:queue7:AVERAGE: \: %5.1lf (avg)
 +
 +[lsf_suspended_queues]
 +       DEF:queue1=lsf_queues.rrd:queue1suspended:AVERAGE
 +       DEF:queue2=lsf_queues.rrd:queue2suspended:AVERAGE
 +       DEF:queue3=lsf_queues.rrd:queue3suspended:AVERAGE
 +       DEF:queue4=lsf_queues.rrd:queue4suspended:AVERAGE
 +       DEF:queue5=lsf_queues.rrd:queue5suspended:AVERAGE
 +       DEF:queue6=lsf_queues.rrd:queue6suspended:AVERAGE
 +       DEF:queue7=lsf_queues.rrd:queue7suspended:AVERAGE
 +       DEF:queue8=lsf_queues.rrd:queue8suspended:AVERAGE
 +       TITLE LSF queues: number of suspended jobs per queue
 +       -l 0
 +       YAXIS #
 +       AREA:queue1#99ccff:queue1
 +       GPRINT:queue1:LAST: \: %5.1lf (cur)
 +       GPRINT:queue1:MAX: \: %5.1lf (max)
 +       GPRINT:queue1:MIN: \: %5.1lf (min)
 +       GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue2#cc99ff:queue2
 +       GPRINT:queue2:LAST: \: %5.1lf (cur)
 +       GPRINT:queue2:MAX: \: %5.1lf (max)
 +       GPRINT:queue2:MIN: \: %5.1lf (min)
 +       GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue3#9933ff:queue3
 +       GPRINT:queue3:LAST: \: %5.1lf (cur)
 +       GPRINT:queue3:MAX: \: %5.1lf (max)
 +       GPRINT:queue3:MIN: \: %5.1lf (min)
 +       GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue4#3399ff:queue4
 +       GPRINT:queue4:LAST: \: %5.1lf (cur)
 +       GPRINT:queue4:MAX: \: %5.1lf (max)
 +       GPRINT:queue4:MIN: \: %5.1lf (min)
 +       GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue5#ff3333:queue5
 +       GPRINT:queue5:LAST: \: %5.1lf (cur)
 +       GPRINT:queue5:MAX: \: %5.1lf (max)
 +       GPRINT:queue5:MIN: \: %5.1lf (min)
 +       GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue6#ff9933:queue6
 +       GPRINT:queue6:LAST: \: %5.1lf (cur)
 +       GPRINT:queue6:MAX: \: %5.1lf (max)
 +       GPRINT:queue6:MIN: \: %5.1lf (min)
 +       GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:queue7#00ff00:queue7
 +       GPRINT:queue7:LAST: \: %5.1lf (cur)
 +       GPRINT:queue7:MAX: \: %5.1lf (max)
 +       GPRINT:queue7:MIN: \: %5.1lf (min)
 +       GPRINT:queue7:AVERAGE: \: %5.1lf (avg)
 +
 +[lsf_queues]
 +       DEF:queue1p=lsf_queues.rrd:queue1pending:AVERAGE
 +       DEF:queue2p=lsf_queues.rrd:queue2pending:AVERAGE
 +       DEF:queue3p=lsf_queues.rrd:queue3pending:AVERAGE
 +       DEF:queue4p=lsf_queues.rrd:queue4pending:AVERAGE
 +       DEF:queue5p=lsf_queues.rrd:queue5pending:AVERAGE
 +       DEF:queue6p=lsf_queues.rrd:queue6pending:AVERAGE
 +       DEF:queue7p=lsf_queues.rrd:queue7pending:AVERAGE
 +       DEF:queue8p=lsf_queues.rrd:queue8pending:AVERAGE
 +       DEF:queue1r=lsf_queues.rrd:queue1running:AVERAGE
 +       DEF:queue2r=lsf_queues.rrd:queue2running:AVERAGE
 +       DEF:queue3r=lsf_queues.rrd:queue3running:AVERAGE
 +       DEF:queue4r=lsf_queues.rrd:queue4running:AVERAGE
 +       DEF:queue5r=lsf_queues.rrd:queue5running:AVERAGE
 +       DEF:queue6r=lsf_queues.rrd:queue6running:AVERAGE
 +       DEF:queue7r=lsf_queues.rrd:queue7running:AVERAGE
 +       DEF:queue8r=lsf_queues.rrd:queue8running:AVERAGE
 +       DEF:queue1s=lsf_queues.rrd:queue1suspended:AVERAGE
 +       DEF:queue2s=lsf_queues.rrd:queue2suspended:AVERAGE
 +       DEF:queue3s=lsf_queues.rrd:queue3suspended:AVERAGE
 +       DEF:queue4s=lsf_queues.rrd:queue4suspended:AVERAGE
 +       DEF:queue5s=lsf_queues.rrd:queue5suspended:AVERAGE
 +       DEF:queue6s=lsf_queues.rrd:queue6suspended:AVERAGE
 +       DEF:queue7s=lsf_queues.rrd:queue7suspended:AVERAGE
 +       DEF:queue8s=lsf_queues.rrd:queue8suspended:AVERAGE
 +       DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE
 +
 +CDEF:pending=queue1p,queue2p,+,queue3p,+,queue4p,+,queue5p,+,queue6p,+,queue7p,+,queue8p,+
 +
 +CDEF:running=queue1r,queue2r,+,queue3r,+,queue4r,+,queue5r,+,queue6r,+,queue7r,+,queue8r,+
 +
 +CDEF:suspended=queue1s,queue2s,+,queue3s,+,queue4s,+,queue5s,+,queue6s,+,queue7s,+,queue8s,+
 +       TITLE LSF queues
 +       -l 0
 +       YAXIS #
 +       AREA:running#cc99ff:running
 +       GPRINT:running:LAST: \: %5.1lf (cur)
 +       GPRINT:running:MAX: \: %5.1lf (max)
 +       GPRINT:running:MIN: \: %5.1lf (min)
 +       GPRINT:running:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:suspended#9933ff:suspended
 +       GPRINT:suspended:LAST: \: %5.1lf (cur)
 +       GPRINT:suspended:MAX: \: %5.1lf (max)
 +       GPRINT:suspended:MIN: \: %5.1lf (min)
 +       GPRINT:suspended:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:pending#99ccff:pending
 +       GPRINT:pending:LAST: \: %5.1lf (cur)
 +       GPRINT:pending:MAX: \: %5.1lf (max)
 +       GPRINT:pending:MIN: \: %5.1lf (min)
 +       GPRINT:pending:AVERAGE: \: %5.1lf (avg)
 +</code>
 +</hidden>
 +
 +===== Source =====
 +==== lsf_queues.pl ====
 +<hidden onHidden="Show Code ⇲" onVisible="Hide Code ⇱">
 +<code perl>
 +#!/usr/bin/perl -w
 +#
 +# client-side script to monitor queues on a lsf HPC farm
 +#
 +# copyright 2006 - 2007  Genome Research Limited / Gildas Le Nadan
 +# This script is released under the Gnu Public
 +# License (GPL) version 2 and Later
 +
 +my $version = 1.0.0;
 +
 +use strict;
 +
 +##### PARAMETERS YOU CAN TWEAK
 +
 +# you can debug this script in your environment by setting 1 below and running
 +# BB=echo BBDISP=127.0.0.1 ./lsf_mon.pl on one of your nodes
 +my $DEBUG = 0;  # 1 for debug, 0 otherwise
 +
 +# hobbit config
 +my $farm = "farm-login"; # the hostname you want to report under
 +my $hobbitcolumn = "lsf_queues"; # the column name for the test
 +my $color = "green"; # default color
 +my $summary = "LSF queues report"; # title of the test output
 +
 +# how the lsf commands must be run
 +my @bqueues=`. /software/noarch/lsf/conf/profile.lsf &&  /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bqueues`;
 +
 +# lines to ignore in the output
 +my $bqueues_banner="QUEUE_NAME      PRIO STATUS          MAX JL/U JL/P JL/H NJOBS  PEND   RUN  SUSP";
 +
 +#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)
 +
 +my $bb = "";
 +my $bbdisp = "";
 +
 +my %hqueues = ();
 +
 +### FUNCTIONS
 +
 +sub print_summary {
 + my $result = "Queues\n";
 + foreach my $queue ( sort keys %hqueues ) {
 + $result .= sprintf( "%s pending  : %4u\n", $queue, $hqueues{$queue}{pend});
 + $result .= sprintf( "%s running  : %4u\n", $queue, $hqueues{$queue}{run});
 + $result .= sprintf( "%s suspended: %4u\n\n", $queue, $hqueues{$queue}{susp});
 + }
 + return $result;
 +}
 +
 +sub process_bqueues {
 + foreach my $line ( @bqueues ) {
 + unless ( $line =~ /$bqueues_banner/ ) {
 + my @fields = split( " ", $line );
 + $hqueues{$fields[0]}{pend} = $fields[8];
 + $hqueues{$fields[0]}{run} = $fields[9];
 + $hqueues{$fields[0]}{susp} = $fields[10];
 + }
 + }
 +}
 +
 +sub send_report {
 + my ( $statusmsg ) = @_;
 + # Build the command we use to send a status to the Xymon daemon
 + my $cmd = $bb . " " . $bbdisp . " \"status " . $farm . "." . $hobbitcolumn . " " . $color . " " . $summary . "\n\n" . $statusmsg . "\"";
 + # And send the message
 + system $cmd;
 +}
 +
 +#### MAIN
 +
 +# Get the BB and BBDISP environment settings.
 +$bb = $ENV{"BB"} || die "BB not defined";
 +$bbdisp = $ENV{"BBDISP"} || die "BBDISP not defined";
 +
 +my $statusmsg = "";
 +
 +process_bqueues;
 +$statusmsg .= print_summary;
 +send_report( $statusmsg );
 +</code>
 +</hidden>
 +
 +===== Known  Bugs and Issues =====
 +
 +===== To Do =====
 +
 +===== Credits =====
 +
 +===== Changelog =====
 +
 +  * **2007-04-08**
 +    * Initial release
  
  • monitors/lsf_queue.txt
  • Last modified: 2009/11/23 05:52
  • by 127.0.0.1