monitors:lsf_mon

lsf_mon

Author Gildas Le Nadan
Compatibility Xymon 4.2
Requirements Perl, unix, Platform LSF
Download None
Last Update 2007-04-08

This script allows monitoring of the hpc farm members.

It displays the number of hosts in each state in an NCV-compatible format, and lists the hosts whose status is “closed administratively”, “closed busy” or “unavailable”.

You can also group the statistics by “host class”, using a regular expression on the host names (see the script source for more details).

Here is a sample report

LSF report

Hosts by status
closed administratively:    4
closed busy            :    0
closed full            :  580
ok                     :    4
unavail                :    5

Statistics per class (%)
CLASS    FULL    OK    BUSY    OTHER
bc-1    93    0    0    7
bc-10    100    0    0    0
bc-2    95    0    0    5
bc-3    90    0    0    10
bc-4    95    0    0    5
bc-5    97    0    0    3
bc-6    100    0    0    0
bc-7    100    0    0    0
bc-8    100    0    0    0
bc-9    100    0    0    0
pingu    0    100    0    0
turing    0    100    0    0

Closed adm hosts:
 bc-1-2-12 bc-2-2-01 bc-5-2-01 bc-5-2-02

Unavail hosts:
 bc-1-3-03 bc-2-1-05 bc-3-3-08 bc-3-3-12 bc-4-3-08 

Client side

  • Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on Linux/Debian), with owner hobbit.hobbit and mode 0755
  • Add an entry to the clientlaunch.cfg file on each host that should run the script. Example on a Linux/Debian host:
    # lsf stats
    [lsf]
           ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg
           CMD /usr/lib/hobbit/client/ext/lsf_mon.pl
           INTERVAL 5m

Server side

  • In hobbitserver.cfg:
    1. Append “lsf=ncv” to TEST2RRD
    2. Append “lsf” to GRAPHS
    3. Add the following line for NCV:
      NCV_lsf="*:NONE,closedadministrativ:GAUGE,closedbusy:GAUGE,closedfull:GAUGE,ok:GAUGE,unavail:GAUGE"
  • You need an entry in one of the bb-hosts files:
    1.2.3.4     myfarm      # noconn TRENDS:*,lsf
  • Edit the hobbitgraph.cfg configuration to add:
    [lsf]
           DEF:admin=lsf.rrd:closedadministrativ:AVERAGE
           DEF:full=lsf.rrd:closedfull:AVERAGE
           DEF:busy=lsf.rrd:closedbusy:AVERAGE
           DEF:ok=lsf.rrd:ok:AVERAGE
           DEF:unavail=lsf.rrd:unavail:AVERAGE
           TITLE LSF status
           -l 0
           YAXIS #
           AREA:unavail#bbbbbb:unavailable
           GPRINT:unavail:LAST: \: %5.1lf (cur)
           GPRINT:unavail:MAX: \: %5.1lf (max)
           GPRINT:unavail:MIN: \: %5.1lf (min)
           GPRINT:unavail:AVERAGE: \: %5.1lf (avg)\n
           STACK:busy#ff0000:closed busy
           GPRINT:busy:LAST: \: %5.1lf (cur)
           GPRINT:busy:MAX: \: %5.1lf (max)
           GPRINT:busy:MIN: \: %5.1lf (min)
           GPRINT:busy:AVERAGE: \: %5.1lf (avg)\n
           STACK:admin#5555cc:closed admin.
           GPRINT:admin:LAST: \: %5.1lf (cur)
           GPRINT:admin:MAX: \: %5.1lf (max)
           GPRINT:admin:MIN: \: %5.1lf (min)
           GPRINT:admin:AVERAGE: \: %5.1lf (avg)\n
           STACK:ok#ff9900:ok
           GPRINT:ok:LAST: \: %5.1lf (cur)
           GPRINT:ok:MAX: \: %5.1lf (max)
           GPRINT:ok:MIN: \: %5.1lf (min)
           GPRINT:ok:AVERAGE: \: %5.1lf (avg)\n
           STACK:full#33cc33:closed full
           GPRINT:full:LAST: \: %5.1lf (cur)
           GPRINT:full:MAX: \: %5.1lf (max)
           GPRINT:full:MIN: \: %5.1lf (min)
           GPRINT:full:AVERAGE: \: %5.1lf (avg)\n 

Show Code ⇲

Hide Code ⇱

#!/usr/bin/perl -w
#
# client-side script to monitor a lsf HPC farm
#
# copyright 2006 - 2007  Genome Research Limited / Gildas Le Nadan
# This script is released under the Gnu Public
# License (GPL) version 2 and Later
 
# Script version. Quoted on purpose: a bare 1.0.0 (two or more dots) is parsed
# by Perl as a v-string, i.e. the three characters chr(1).chr(0).chr(0), not
# the text "1.0.0".
my $version = "1.0.0";
 
use strict;
 
#### PARAMETERS YOU CAN TWEAK
 
# You can debug this script in your environment by setting 1 below and running
# BB=echo BBDISP=127.0.0.1 ./lsf_mon.pl on one of your nodes.
# With debugging on, the main loop prints every host name it processes.
my $DEBUG = 0;	# 1 for debug, 0 otherwise
 
# How the lsf commands must be run.
# Both commands are executed right here, when the script starts; each array
# receives the command output, one line per element. The exit status of the
# commands is not checked.
my @lsload=`. /software/noarch/lsf/conf/profile.lsf &&  /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/lsload`;
my @bhosts=`. /software/noarch/lsf/conf/profile.lsf &&  /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bhosts -w`;
 
# Lines to ignore in the outputs.
# These strings are used as regular expressions against every output line, so
# the spacing must match the header line printed by the command.
my $bhosts_banner="HOST_NAME          STATUS          JL/U    MAX  NJOBS    RUN  SSUSP  USUSP    RSV";
my $lsload_banner="HOST_NAME       status  r15s   r1m  r15m   ut    pg  ls    it   tmp   swp   mem";
 
# Classes are a way to group the results for hosts that are part of the same
# group/class. A regexp is used to derive the class from the host name, so
# your hostnames must be related to the class they are part of.
my $USE_CLASSES = 1;	# 1 if we use the concept of server classes, 0 otherwise
 
# How to split our hostnames into classes: the first capture group is the
# class name. Hosts that do not match are left out of the class statistics.
my $class_regex = "^(pingu|turing|bc-[0-9]*)-*[0-9]*-*[0-9]*";
 
# hobbit config
my $farm = "farm-login";	# the hostname you want to report under
my $hobbitcolumn = "lsf";	# the column name for the test
my $color = "green";		# default color (send_report may switch it to yellow)
my $summary = "LSF report";	# title of the test output
 
#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)
 
# Sender command and display server; both are filled in from %ENV in MAIN.
my ( $bb, $bbdisp ) = ( "", "" );

# %hash  : host name  => fields parsed from the lsload/bhosts output
# %class : class name => { status string => number of hosts }
my ( %hash, %class );

# Space-separated host lists, one per reported status.
my ( $admclosed_hosts, $busy_hosts, $unavail_hosts, $unreach_hosts ) = ( "" ) x 4;

# Free-text warnings; a non-empty value turns the report yellow.
my ( $non_efficient, $misc ) = ( "" ) x 2;

# Number of hosts seen in each status.
my ( $closed_full_counter, $closed_busy_counter, $closed_adm_counter ) = ( 0 ) x 3;
my ( $ok_counter, $unavail_counter, $unreach_counter ) = ( 0 ) x 3;

# Status strings looked for in the STATUS column of the bhosts output.
my ( $CL_ADM, $CL_BUSY, $CL_FULL ) = ( "closed_Adm", "closed_Busy", "closed_Full" );
my ( $OK, $UNREACHABLE, $UNAVAILABLE ) = ( "ok", "unreach", "unavail" );
 
### FUNCTIONS
 
# Count one host with the given status in the statistics of its class.
#   $status - status string used as the counter key
#   $host   - host name; its class is the first capture group of $class_regex
# Hosts whose name does not match $class_regex are not counted in any class.
sub add_to_class {
	my ( $status, $host ) = @_;
	return unless $host =~ /$class_regex/;
	my $per_status = ( $class{$1} ||= {} );
	$per_status->{$status} = ( $per_status->{$status} || 0 ) + 1;
}
 
# Record a host that is closed administratively: remember its name for the
# report, bump the counter and, when classes are in use, its class statistics.
sub add_to_closed_adm {
	my $host = shift;
	$admclosed_hosts .= " " . $host;
	$closed_adm_counter += 1;
	add_to_class( $CL_ADM, $host ) if $USE_CLASSES;
}
 
# Record a host that is closed because it is busy: remember its name for the
# report, bump the counter and, when classes are in use, its class statistics.
sub add_to_busy {
	my $host = shift;
	$busy_hosts .= " " . $host;
	$closed_busy_counter += 1;
	add_to_class( $CL_BUSY, $host ) if $USE_CLASSES;
}
 
# Record a host with status ok. Only the counters are updated; the names of
# ok hosts are not kept.
sub add_to_ok {
	my $host = shift;
	$ok_counter += 1;
	add_to_class( $OK, $host ) if $USE_CLASSES;
}
 
# Record a host that is closed because it is full. Only the counters are
# updated; the names of full hosts are not kept.
sub add_to_full {
	my $host = shift;
	$closed_full_counter += 1;
	add_to_class( $CL_FULL, $host ) if $USE_CLASSES;
}
 
# Record an unavailable host: remember its name for the report, bump the
# counter and, when classes are in use, its class statistics.
sub add_to_unavail {
	my $host = shift;
	$unavail_hosts .= " " . $host;
	$unavail_counter += 1;
	add_to_class( $UNAVAILABLE, $host ) if $USE_CLASSES;
}
 
# Record an unreachable host: remember its name, bump the counter and, when
# classes are in use, its class statistics.
sub add_to_unreach {
	my $host = shift;
	$unreach_hosts .= " " . $host;
	$unreach_counter += 1;
	add_to_class( $UNREACHABLE, $host ) if $USE_CLASSES;
}
 
# Build the part of the report that names the hosts which are neither full
# nor ok. One section is produced per non-empty host list:
#
#   <blank line>
#   <title>:
#   <space-separated host names>
#
# Unreachable hosts are listed too: add_to_unreach collects them, and leaving
# them out made those hosts disappear from the report entirely.
# Returns the text, or an empty string when every list is empty.
sub print_non_full_or_ok {
	my @sections = (
		[ "Closed adm hosts", $admclosed_hosts ],
		[ "Busy hosts",       $busy_hosts ],
		[ "Unavail hosts",    $unavail_hosts ],
		[ "Unreach hosts",    $unreach_hosts ],
	);
	my $result = "";
	foreach my $section ( @sections ) {
		my ( $title, $hosts ) = @$section;
		next if $hosts eq "";
		$result .= "\n" . $title . ":\n" . $hosts . "\n";
	}
	return $result;
}
 
# Express $value as a percentage of $total, rounded to a whole number.
#   $value - the part
#   $total - the whole
# Returns the percentage formatted with "%.0f". An undefined or zero total
# yields "0" instead of dying with "Illegal division by zero".
sub percent {
	my ( $value, $total ) = @_;
	return sprintf( "%.0f", 0 ) unless $total;
	return sprintf( "%.0f", ( $value / $total ) * 100 );
}
 
# Build the "Statistics per class" table: one tab-separated row per class
# (sorted by name) giving the percentage of hosts that are full, ok, busy or
# in any other state (closed adm + unavail + unreach).
# Missing counters in %class are set to 0 along the way.
# Returns the table as a string, header included.
sub print_classes {
	my @statuses = ( $OK, $CL_FULL, $CL_BUSY, $CL_ADM, $UNAVAILABLE, $UNREACHABLE );
	my @rows = ( "\nStatistics per class (%)\nCLASS\tFULL\tOK\tBUSY\tOTHER\n" );

	foreach my $name ( sort keys %class ) {
		my $count = $class{$name};

		# make sure every counter we read below exists
		foreach my $status ( @statuses ) {
			$count->{$status} = 0 unless defined $count->{$status};
		}

		my $others = $count->{$CL_ADM} + $count->{$UNAVAILABLE} + $count->{$UNREACHABLE};
		my $total  = $others + $count->{$OK} + $count->{$CL_FULL} + $count->{$CL_BUSY};

		my @cells = map { percent( $_, $total ) }
			( $count->{$CL_FULL}, $count->{$OK}, $count->{$CL_BUSY}, $others );
		push @rows, join( "\t", $name, @cells ) . "\n";
	}
	return join( "", @rows );
}
 
# Build the "Hosts by status" section: one "name : count" line per status,
# which is the layout the NCV data collection reads.
# Returns the section as a string.
sub print_summary {
	return join( "",
		"Hosts by status\n",
		sprintf( "closed administratively: %4u\n", $closed_adm_counter ),
		sprintf( "closed busy            : %4u\n", $closed_busy_counter ),
		sprintf( "closed full            : %4u\n", $closed_full_counter ),
		sprintf( "ok                     : %4u\n", $ok_counter ),
		sprintf( "unavail                : %4u\n", $unavail_counter ),
	);
}
 
# A non-efficient job is a job limited by something other than the cpu, i.e.
# one that sits on the farm doing nothing, or close to nothing.
# The way I used to measure it was not fine-grained enough, so I removed this
# feature for the time being.
#
#sub print_non_efficient {
#	my $result = "";
#	if ( $non_efficient ne "" ) {
#		$result .= "\n&YELLOW Hosts running non efficient tasks:\n".$non_efficient."\n";
#	}
#	return $result;
#}
 
# Build the "Misc" section of the report from $misc.
# Returns an empty string when there is nothing to report.
sub print_misc_messages {
	return "" if $misc eq "";
	return "\nMisc:\n" . $misc . "\n";
}
 
# Parse the lsload output held in @lsload and store, for every host, the
# sixth column (the "ut" column of $lsload_banner) as $hash{host}{cpu_usage}.
# The banner line is ignored. Blank lines are ignored as well: they have no
# host name, and using the undefined first field as a hash key would warn
# under -w and create a bogus host named "".
sub process_lsload {
	foreach my $line ( @lsload ) {
		next if $line =~ /$lsload_banner/;
		my @fields = split( " ", $line );
		next unless @fields;
		# lines with fewer columns simply leave cpu_usage undefined
		$hash{$fields[0]}{cpu_usage} = $fields[5];
	}
}
 
# Parse the "bhosts -w" output held in @bhosts and store, for every host, the
# columns we care about in %hash (column names as in $bhosts_banner):
#   status   <- STATUS     max_jobs <- MAX      nb_jobs <- NJOBS
#   run_jobs <- RUN        sys_susp <- SSUSP
# The banner line is ignored. Lines without at least a host name and a status
# (blank lines, for instance) are ignored too, so that no host ends up in
# %hash with a bogus name or an undefined status.
sub process_bhosts {
	foreach my $line ( @bhosts ) {
		next if $line =~ /$bhosts_banner/;
		my @fields = split( " ", $line );
		next unless @fields >= 2;
		@{ $hash{$fields[0]} }{qw( status max_jobs nb_jobs run_jobs sys_susp )}
			= @fields[ 1, 3, 4, 5, 6 ];
	}
}
 
# File one host under the status recorded for it in $hash{host}{status}.
#   $host - host name, a key of %hash
# The status is matched against each known status string in turn and the
# first match decides which add_to_* routine is called.
# A host without a status (one that only appeared in the lsload output) is
# skipped; matching an undefined value would only produce warnings under -w.
# A status that matches none of the known strings is not counted anywhere.
sub process_host {
	my ( $host ) = @_;

	my $status = $hash{$host}{status};
	return unless defined $status;

	# checked in this order; the first matching pattern wins
	my @dispatch = (
		[ $CL_ADM,      \&add_to_closed_adm ],
		[ $CL_BUSY,     \&add_to_busy ],
		[ $CL_FULL,     \&add_to_full ],
		[ $OK,          \&add_to_ok ],
		[ $UNAVAILABLE, \&add_to_unavail ],
		[ $UNREACHABLE, \&add_to_unreach ],
	);

	foreach my $entry ( @dispatch ) {
		my ( $pattern, $handler ) = @$entry;
		if ( $status =~ /$pattern/ ) {
			$handler->( $host );
			last;
		}
	}
}
 
# Send the status message to the Xymon daemon.
#   $statusmsg - body of the report
# The color is switched to yellow when $non_efficient or $misc is not empty.
# The sender is run as "$bb $bbdisp <message>" without going through a shell,
# so quotes, dollar signs or backticks in the message can neither break the
# command line nor be interpreted. $bb is split on whitespace so that a value
# carrying options keeps working.
# Returns the value of system(): 0 on success, -1 when the command could not
# be started, otherwise the wait status. Failures are reported on STDERR.
sub send_report {
	my ( $statusmsg ) = @_;
	if ( ( $non_efficient ne "" ) || ( $misc ne "" ) ) {
		$color = "yellow";
	}
	# Build the message we send to the Xymon daemon
	my $message = "status " . $farm . "." . $hobbitcolumn . " " . $color . " " . $summary . "\n\n" . $statusmsg;
	my @command = ( split( " ", $bb ), $bbdisp, $message );
	# And send it
	my $rc = system( @command );
	if ( $rc == -1 ) {
		warn "lsf_mon: cannot run $bb: $!\n";
	}
	elsif ( $rc != 0 ) {
		warn "lsf_mon: $bb returned wait status $rc\n";
	}
	return $rc;
}
 
#### MAIN
 
# The sender command (BB) and the display server (BBDISP) come from the
# environment; give up right away when either one is missing.
$bb     = $ENV{"BB"}     || die "BB not defined";
$bbdisp = $ENV{"BBDISP"} || die "BBDISP not defined";

# load the lsf reports into %hash
process_lsload();
process_bhosts();

# file every host under its status
for my $host ( sort keys %hash ) {
	print "$host\n" if $DEBUG;
	process_host( $host );
}

# assemble the report section by section, then send it
my $statusmsg = print_summary();
$statusmsg .= print_classes() if $USE_CLASSES;
$statusmsg .= print_non_full_or_ok();
#$statusmsg .= print_non_efficient;
$statusmsg .= print_misc_messages();
send_report( $statusmsg );
  • 2007-04-08
    • Initial release
  • monitors/lsf_mon.txt
  • Last modified: 2009/11/23 05:51
  • by 127.0.0.1