lsf_queues

Author Gildas Le Nadan
Compatibility Xymon 4.2
Requirements Perl, unix, Platform LSF
Download None
Last Update 2007-04-08

This script is used to report the number of jobs in running/pending/suspended state for each queue in an ncv-compatible fashion, allowing queue graphing.

Client side

  • Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on linux/debian), with owner hobbit.hobbit and rights 0755
  • Add an entry to the clientlaunch.cfg file on each host that should run the script. Example on a linux/debian host:
    [lsfqueues]
           ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg
           CMD /usr/lib/hobbit/client/ext/lsf_queues.pl
           INTERVAL 5m

Server side

  • In hobbitserver.cfg:
    1. Append “lsf_queues=ncv” to TEST2RRD
    2. Append “lsf_queues” to GRAPHS
    3. Add the following line for NCV:
      NCV_lsf_queues="*:GAUGE"
  • You need an entry in one of the bb-hosts files:
    1.2.3.4     myfarm      # noconn TRENDS:*,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues

    or if you want to combine it with the lsf_mon script:

    1.2.3.4     myfarm      # noconn TRENDS:*,lsf,lsf_queues:lsf_queues|lsf_running_queues|lsf_suspended_queues|lsf_pending_queues
  • Edit the hobbitgraph.cfg configuration to add the following entries. Please note that I manually entered an entry for each queue I wanted to display. You will obviously need to change it to adapt it to your setup.

Show Code ⇲

Hide Code ⇱

# Stacked area graph of the number of PENDING jobs, one layer per queue.
# "queue1" .. "queue8" are placeholders: replace them with your own queue
# names, both in the dataset names (<queue>pending) and in the legend labels.
# Every DEF below is drawn; queue8 was previously defined but never plotted.
[lsf_pending_queues]
       DEF:queue1=lsf_queues.rrd:queue1pending:AVERAGE
       DEF:queue2=lsf_queues.rrd:queue2pending:AVERAGE
       DEF:queue3=lsf_queues.rrd:queue3pending:AVERAGE
       DEF:queue4=lsf_queues.rrd:queue4pending:AVERAGE
       DEF:queue5=lsf_queues.rrd:queue5pending:AVERAGE
       DEF:queue6=lsf_queues.rrd:queue6pending:AVERAGE
       DEF:queue7=lsf_queues.rrd:queue7pending:AVERAGE
       DEF:queue8=lsf_queues.rrd:queue8pending:AVERAGE
       TITLE LSF queues: number of pending jobs per queue
       -l 0
       YAXIS #
       AREA:queue1#99ccff:queue1
       GPRINT:queue1:LAST: \: %5.1lf (cur)
       GPRINT:queue1:MAX: \: %5.1lf (max)
       GPRINT:queue1:MIN: \: %5.1lf (min)
       GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue2#cc99ff:queue2
       GPRINT:queue2:LAST: \: %5.1lf (cur)
       GPRINT:queue2:MAX: \: %5.1lf (max)
       GPRINT:queue2:MIN: \: %5.1lf (min)
       GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue3#9933ff:queue3
       GPRINT:queue3:LAST: \: %5.1lf (cur)
       GPRINT:queue3:MAX: \: %5.1lf (max)
       GPRINT:queue3:MIN: \: %5.1lf (min)
       GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue4#3399ff:queue4
       GPRINT:queue4:LAST: \: %5.1lf (cur)
       GPRINT:queue4:MAX: \: %5.1lf (max)
       GPRINT:queue4:MIN: \: %5.1lf (min)
       GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue5#ff3333:queue5
       GPRINT:queue5:LAST: \: %5.1lf (cur)
       GPRINT:queue5:MAX: \: %5.1lf (max)
       GPRINT:queue5:MIN: \: %5.1lf (min)
       GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue6#ff9933:queue6
       GPRINT:queue6:LAST: \: %5.1lf (cur)
       GPRINT:queue6:MAX: \: %5.1lf (max)
       GPRINT:queue6:MIN: \: %5.1lf (min)
       GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue7#00ff00:queue7
       GPRINT:queue7:LAST: \: %5.1lf (cur)
       GPRINT:queue7:MAX: \: %5.1lf (max)
       GPRINT:queue7:MIN: \: %5.1lf (min)
       GPRINT:queue7:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue8#009999:queue8
       GPRINT:queue8:LAST: \: %5.1lf (cur)
       GPRINT:queue8:MAX: \: %5.1lf (max)
       GPRINT:queue8:MIN: \: %5.1lf (min)
       GPRINT:queue8:AVERAGE: \: %5.1lf (avg)

# Stacked area graph of the number of RUNNING jobs, one layer per queue.
# "queue1" .. "queue8" are placeholders: replace them with your own queue
# names, both in the dataset names (<queue>running) and in the legend labels.
# Every DEF below is drawn; queue8 was previously defined but never plotted.
[lsf_running_queues]
       DEF:queue1=lsf_queues.rrd:queue1running:AVERAGE
       DEF:queue2=lsf_queues.rrd:queue2running:AVERAGE
       DEF:queue3=lsf_queues.rrd:queue3running:AVERAGE
       DEF:queue4=lsf_queues.rrd:queue4running:AVERAGE
       DEF:queue5=lsf_queues.rrd:queue5running:AVERAGE
       DEF:queue6=lsf_queues.rrd:queue6running:AVERAGE
       DEF:queue7=lsf_queues.rrd:queue7running:AVERAGE
       DEF:queue8=lsf_queues.rrd:queue8running:AVERAGE
       TITLE LSF queues: number of running jobs per queue
       -l 0
       YAXIS #
       AREA:queue1#99ccff:queue1
       GPRINT:queue1:LAST: \: %5.1lf (cur)
       GPRINT:queue1:MAX: \: %5.1lf (max)
       GPRINT:queue1:MIN: \: %5.1lf (min)
       GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue2#cc99ff:queue2
       GPRINT:queue2:LAST: \: %5.1lf (cur)
       GPRINT:queue2:MAX: \: %5.1lf (max)
       GPRINT:queue2:MIN: \: %5.1lf (min)
       GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue3#9933ff:queue3
       GPRINT:queue3:LAST: \: %5.1lf (cur)
       GPRINT:queue3:MAX: \: %5.1lf (max)
       GPRINT:queue3:MIN: \: %5.1lf (min)
       GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue4#3399ff:queue4
       GPRINT:queue4:LAST: \: %5.1lf (cur)
       GPRINT:queue4:MAX: \: %5.1lf (max)
       GPRINT:queue4:MIN: \: %5.1lf (min)
       GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue5#ff3333:queue5
       GPRINT:queue5:LAST: \: %5.1lf (cur)
       GPRINT:queue5:MAX: \: %5.1lf (max)
       GPRINT:queue5:MIN: \: %5.1lf (min)
       GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue6#ff9933:queue6
       GPRINT:queue6:LAST: \: %5.1lf (cur)
       GPRINT:queue6:MAX: \: %5.1lf (max)
       GPRINT:queue6:MIN: \: %5.1lf (min)
       GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue7#00ff00:queue7
       GPRINT:queue7:LAST: \: %5.1lf (cur)
       GPRINT:queue7:MAX: \: %5.1lf (max)
       GPRINT:queue7:MIN: \: %5.1lf (min)
       GPRINT:queue7:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue8#009999:queue8
       GPRINT:queue8:LAST: \: %5.1lf (cur)
       GPRINT:queue8:MAX: \: %5.1lf (max)
       GPRINT:queue8:MIN: \: %5.1lf (min)
       GPRINT:queue8:AVERAGE: \: %5.1lf (avg)

# Stacked area graph of the number of SUSPENDED jobs, one layer per queue.
# "queue1" .. "queue8" are placeholders: replace them with your own queue
# names, both in the dataset names (<queue>suspended) and in the legend labels.
# Every DEF below is drawn; queue8 was previously defined but never plotted.
[lsf_suspended_queues]
       DEF:queue1=lsf_queues.rrd:queue1suspended:AVERAGE
       DEF:queue2=lsf_queues.rrd:queue2suspended:AVERAGE
       DEF:queue3=lsf_queues.rrd:queue3suspended:AVERAGE
       DEF:queue4=lsf_queues.rrd:queue4suspended:AVERAGE
       DEF:queue5=lsf_queues.rrd:queue5suspended:AVERAGE
       DEF:queue6=lsf_queues.rrd:queue6suspended:AVERAGE
       DEF:queue7=lsf_queues.rrd:queue7suspended:AVERAGE
       DEF:queue8=lsf_queues.rrd:queue8suspended:AVERAGE
       TITLE LSF queues: number of suspended jobs per queue
       -l 0
       YAXIS #
       AREA:queue1#99ccff:queue1
       GPRINT:queue1:LAST: \: %5.1lf (cur)
       GPRINT:queue1:MAX: \: %5.1lf (max)
       GPRINT:queue1:MIN: \: %5.1lf (min)
       GPRINT:queue1:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue2#cc99ff:queue2
       GPRINT:queue2:LAST: \: %5.1lf (cur)
       GPRINT:queue2:MAX: \: %5.1lf (max)
       GPRINT:queue2:MIN: \: %5.1lf (min)
       GPRINT:queue2:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue3#9933ff:queue3
       GPRINT:queue3:LAST: \: %5.1lf (cur)
       GPRINT:queue3:MAX: \: %5.1lf (max)
       GPRINT:queue3:MIN: \: %5.1lf (min)
       GPRINT:queue3:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue4#3399ff:queue4
       GPRINT:queue4:LAST: \: %5.1lf (cur)
       GPRINT:queue4:MAX: \: %5.1lf (max)
       GPRINT:queue4:MIN: \: %5.1lf (min)
       GPRINT:queue4:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue5#ff3333:queue5
       GPRINT:queue5:LAST: \: %5.1lf (cur)
       GPRINT:queue5:MAX: \: %5.1lf (max)
       GPRINT:queue5:MIN: \: %5.1lf (min)
       GPRINT:queue5:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue6#ff9933:queue6
       GPRINT:queue6:LAST: \: %5.1lf (cur)
       GPRINT:queue6:MAX: \: %5.1lf (max)
       GPRINT:queue6:MIN: \: %5.1lf (min)
       GPRINT:queue6:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue7#00ff00:queue7
       GPRINT:queue7:LAST: \: %5.1lf (cur)
       GPRINT:queue7:MAX: \: %5.1lf (max)
       GPRINT:queue7:MIN: \: %5.1lf (min)
       GPRINT:queue7:AVERAGE: \: %5.1lf (avg)\n
       STACK:queue8#009999:queue8
       GPRINT:queue8:LAST: \: %5.1lf (cur)
       GPRINT:queue8:MAX: \: %5.1lf (max)
       GPRINT:queue8:MIN: \: %5.1lf (min)
       GPRINT:queue8:AVERAGE: \: %5.1lf (avg)

# Farm-wide totals: the per-queue pending / running / suspended datasets are
# each summed with an RPN CDEF and drawn as three stacked areas.
# "queue1" .. "queue8" are placeholders: replace them with your own queue
# names and keep the three CDEF sums in step with the DEF lines.
[lsf_queues]
       DEF:queue1p=lsf_queues.rrd:queue1pending:AVERAGE
       DEF:queue2p=lsf_queues.rrd:queue2pending:AVERAGE
       DEF:queue3p=lsf_queues.rrd:queue3pending:AVERAGE
       DEF:queue4p=lsf_queues.rrd:queue4pending:AVERAGE
       DEF:queue5p=lsf_queues.rrd:queue5pending:AVERAGE
       DEF:queue6p=lsf_queues.rrd:queue6pending:AVERAGE
       DEF:queue7p=lsf_queues.rrd:queue7pending:AVERAGE
       DEF:queue8p=lsf_queues.rrd:queue8pending:AVERAGE
       DEF:queue1r=lsf_queues.rrd:queue1running:AVERAGE
       DEF:queue2r=lsf_queues.rrd:queue2running:AVERAGE
       DEF:queue3r=lsf_queues.rrd:queue3running:AVERAGE
       DEF:queue4r=lsf_queues.rrd:queue4running:AVERAGE
       DEF:queue5r=lsf_queues.rrd:queue5running:AVERAGE
       DEF:queue6r=lsf_queues.rrd:queue6running:AVERAGE
       DEF:queue7r=lsf_queues.rrd:queue7running:AVERAGE
       DEF:queue8r=lsf_queues.rrd:queue8running:AVERAGE
       DEF:queue1s=lsf_queues.rrd:queue1suspended:AVERAGE
       DEF:queue2s=lsf_queues.rrd:queue2suspended:AVERAGE
       DEF:queue3s=lsf_queues.rrd:queue3suspended:AVERAGE
       DEF:queue4s=lsf_queues.rrd:queue4suspended:AVERAGE
       DEF:queue5s=lsf_queues.rrd:queue5suspended:AVERAGE
       DEF:queue6s=lsf_queues.rrd:queue6suspended:AVERAGE
       DEF:queue7s=lsf_queues.rrd:queue7suspended:AVERAGE
       DEF:queue8s=lsf_queues.rrd:queue8suspended:AVERAGE
       CDEF:pending=queue1p,queue2p,+,queue3p,+,queue4p,+,queue5p,+,queue6p,+,queue7p,+,queue8p,+
       CDEF:running=queue1r,queue2r,+,queue3r,+,queue4r,+,queue5r,+,queue6r,+,queue7r,+,queue8r,+
       CDEF:suspended=queue1s,queue2s,+,queue3s,+,queue4s,+,queue5s,+,queue6s,+,queue7s,+,queue8s,+
       TITLE LSF queues
       -l 0
       YAXIS #
       AREA:running#cc99ff:running
       GPRINT:running:LAST: \: %5.1lf (cur)
       GPRINT:running:MAX: \: %5.1lf (max)
       GPRINT:running:MIN: \: %5.1lf (min)
       GPRINT:running:AVERAGE: \: %5.1lf (avg)\n
       STACK:suspended#9933ff:suspended
       GPRINT:suspended:LAST: \: %5.1lf (cur)
       GPRINT:suspended:MAX: \: %5.1lf (max)
       GPRINT:suspended:MIN: \: %5.1lf (min)
       GPRINT:suspended:AVERAGE: \: %5.1lf (avg)\n
       STACK:pending#99ccff:pending
       GPRINT:pending:LAST: \: %5.1lf (cur)
       GPRINT:pending:MAX: \: %5.1lf (max)
       GPRINT:pending:MIN: \: %5.1lf (min)
       GPRINT:pending:AVERAGE: \: %5.1lf (avg)

lsf_queues.pl

Show Code ⇲

Hide Code ⇱

#!/usr/bin/perl -w
#
# client-side script to monitor queues on a lsf HPC farm
#
# copyright 2006 - 2007  Genome Research Limited / Gildas Le Nadan
# This script is released under the Gnu Public
# License (GPL) version 2 and Later
 
use strict;

# Script version, kept as a plain string: an unquoted 1.0.0 is parsed by Perl
# as a v-string (the three characters chr(1), chr(0), chr(0)), not as the
# text "1.0.0".
my $version = "1.0.0";

##### PARAMETERS YOU CAN TWEAK

# you can debug this script in your environment by setting 1 below and running
# BB=echo BBDISP=127.0.0.1 ./lsf_queues.pl on one of your nodes
my $DEBUG = 0;  # 1 for debug, 0 otherwise

# hobbit config
my $farm = "farm-login";		# the hostname you want to report under
my $hobbitcolumn = "lsf_queues";	# the column name for the test
my $color = "green";			# default color
my $summary = "LSF queues report";	# title of the test output

# how the lsf commands must be run: the LSF profile is sourced first, then
# bqueues is executed; its output is captured one line per array element.
# Adapt both paths to your LSF installation.
my @bqueues=`. /software/noarch/lsf/conf/profile.lsf &&  /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bqueues`;

# lines to ignore in the output (the bqueues column header)
my $bqueues_banner="QUEUE_NAME      PRIO STATUS          MAX JL/U JL/P JL/H NJOBS  PEND   RUN  SUSP";

#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)

# Sender command and display server address; both are filled in from the
# BB and BBDISP environment variables by the main section.
my $bb = "";
my $bbdisp = "";

# Per-queue job counts: $hqueues{<queue name>}{pend|run|susp}
my %hqueues = ();
 
### FUNCTIONS
 
sub print_summary {
	# Render the collected counts as the status body: a "Queues" header,
	# then for every queue (in sorted name order) one "<queue> <state>: <n>"
	# line per state, followed by a blank line.
	# Reads the file-level %hqueues hash; returns the text as one string.
	my @chunks = ( "Queues\n" );
	for my $name ( sort keys %hqueues ) {
		push @chunks, sprintf( "%s pending  : %4u\n", $name, $hqueues{$name}{pend} );
		push @chunks, sprintf( "%s running  : %4u\n", $name, $hqueues{$name}{run} );
		push @chunks, sprintf( "%s suspended: %4u\n\n", $name, $hqueues{$name}{susp} );
	}
	return join( "", @chunks );
}
 
sub process_bqueues {
	# Parse the captured bqueues output (file-level @bqueues) and record the
	# pending / running / suspended job counts of every queue in %hqueues,
	# keyed by queue name. Takes no arguments; returns nothing useful.
	#
	# A line is only accepted as a queue entry when it has the 11
	# whitespace-separated columns of the banner and its PEND, RUN and SUSP
	# columns are plain integers. This skips the header even when its column
	# spacing differs from $bqueues_banner, and it skips blank lines or error
	# text, which would otherwise create bogus (or undefined) queue names.
	foreach my $line ( @bqueues ) {
		# \Q..\E: the banner is literal text, not a regular expression
		next if $line =~ /\Q$bqueues_banner\E/;

		my @fields = split( " ", $line );
		next if @fields < 11;

		my ( $queue, $pend, $run, $susp ) = @fields[ 0, 8, 9, 10 ];
		next if grep { !/^\d+$/ } ( $pend, $run, $susp );

		$hqueues{$queue}{pend} = $pend;
		$hqueues{$queue}{run} = $run;
		$hqueues{$queue}{susp} = $susp;
	}
}
 
sub send_report {
	# Send a status message for $farm.$hobbitcolumn to the Xymon daemon.
	#
	# Parameter: $statusmsg - the body of the report (text below the summary).
	# Returns:   the return value of system() (0 when the sender succeeded).
	#
	# Uses the file-level $bb (sender command), $bbdisp (display server),
	# $farm, $hobbitcolumn, $color and $summary.
	my ( $statusmsg ) = @_;

	# The whole status message is passed as ONE argument.
	my $message = "status " . $farm . "." . $hobbitcolumn . " " . $color . " " . $summary . "\n\n" . $statusmsg;

	# List form of system(): no shell is involved, so quotes, dollar signs or
	# backticks in the message cannot be interpreted or break the command.
	# $bb is split on whitespace so that a BB value carrying extra arguments
	# keeps working as it did when the shell did the splitting.
	my @command = ( split( " ", $bb ), $bbdisp, $message );

	# And send the message
	my $rc = system( @command );
	if ( $rc != 0 ) {
		warn "could not send $hobbitcolumn status using '$bb' (system returned $rc)\n";
	}
	return $rc;
}
 
#### MAIN
 
# The sender command (BB) and the display server (BBDISP) come from the
# environment; abort when either one is missing or empty.
( $bb = $ENV{"BB"} ) or die "BB not defined";
( $bbdisp = $ENV{"BBDISP"} ) or die "BBDISP not defined";

# Collect the per-queue counts, format them, and ship the report.
process_bqueues();
my $statusmsg = "" . print_summary();
send_report( $statusmsg );
  • 2007-04-08
    • Initial release
  • monitors/lsf_queue.txt
  • Last modified: 2009/11/23 05:52
  • (external edit)