====== lsf_mon ======
^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] |
^ Compatibility | Xymon 4.2 |
^ Requirements | Perl, unix, Platform LSF |
^ Download | None |
^ Last Update | 2007-04-08 |
===== Description =====
This script allows monitoring of the hpc farm members.
It displays the number of hosts in each state in an NCV-compatible manner, and lists the hosts whose status is "closed administratively" or "unavailable".
You can also group the statistics per "host class" using a regexp (see the script source for more details).
Here is a sample report
LSF report
Hosts by status
closed administratively: 4
closed busy : 0
closed full : 580
ok : 4
unavail : 5
Statistics per class (%)
CLASS FULL OK BUSY OTHER
bc-1 93 0 0 7
bc-10 100 0 0 0
bc-2 95 0 0 5
bc-3 90 0 0 10
bc-4 95 0 0 5
bc-5 97 0 0 3
bc-6 100 0 0 0
bc-7 100 0 0 0
bc-8 100 0 0 0
bc-9 100 0 0 0
pingu 0 100 0 0
turing 0 100 0 0
Closed adm hosts:
bc-1-2-12 bc-2-2-01 bc-5-2-01 bc-5-2-02
Unavail hosts:
bc-1-3-03 bc-2-1-05 bc-3-3-08 bc-3-3-12 bc-4-3-08
===== Installation =====
=== Client side ===
* Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on Linux/Debian), with owner hobbit.hobbit and mode 0755
* Add an entry to the clientlaunch.cfg file on the host that runs the script. Example on a Linux/Debian host:
# lsf stats
[lsf]
ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg
CMD /usr/lib/hobbit/client/ext/lsf_mon.pl
INTERVAL 5m
=== Server side ===
* In hobbitserver.cfg:
- Append "lsf=ncv" to TEST2RRD
- Append "lsf" to GRAPHS
- Add the following line for NCV:
NCV_lsf="*:NONE,closedadministrativ:GAUGE,closedbusy:GAUGE,closedfull:GAUGE,ok:GAUGE,unavail:GAUGE"
* You need an entry in one of the bb-hosts files:
1.2.3.4 myfarm # noconn TRENDS:*,lsf
* Edit the hobbitgraph.cfg configuration to add:
[lsf]
DEF:admin=lsf.rrd:closedadministrativ:AVERAGE
DEF:full=lsf.rrd:closedfull:AVERAGE
DEF:busy=lsf.rrd:closedbusy:AVERAGE
DEF:ok=lsf.rrd:ok:AVERAGE
DEF:unavail=lsf.rrd:unavail:AVERAGE
TITLE LSF status
-l 0
YAXIS #
AREA:unavail#bbbbbb:unavailable
GPRINT:unavail:LAST: \: %5.1lf (cur)
GPRINT:unavail:MAX: \: %5.1lf (max)
GPRINT:unavail:MIN: \: %5.1lf (min)
GPRINT:unavail:AVERAGE: \: %5.1lf (avg)\n
STACK:busy#ff0000:closed busy
GPRINT:busy:LAST: \: %5.1lf (cur)
GPRINT:busy:MAX: \: %5.1lf (max)
GPRINT:busy:MIN: \: %5.1lf (min)
GPRINT:busy:AVERAGE: \: %5.1lf (avg)\n
STACK:admin#5555cc:closed admin.
GPRINT:admin:LAST: \: %5.1lf (cur)
GPRINT:admin:MAX: \: %5.1lf (max)
GPRINT:admin:MIN: \: %5.1lf (min)
GPRINT:admin:AVERAGE: \: %5.1lf (avg)\n
STACK:ok#ff9900:ok
GPRINT:ok:LAST: \: %5.1lf (cur)
GPRINT:ok:MAX: \: %5.1lf (max)
GPRINT:ok:MIN: \: %5.1lf (min)
GPRINT:ok:AVERAGE: \: %5.1lf (avg)\n
STACK:full#33cc33:closed full
GPRINT:full:LAST: \: %5.1lf (cur)
GPRINT:full:MAX: \: %5.1lf (max)
GPRINT:full:MIN: \: %5.1lf (min)
GPRINT:full:AVERAGE: \: %5.1lf (avg)\n
===== Source =====
==== lsf_mon.pl ====
#!/usr/bin/perl -w
#
# client-side script to monitor a lsf HPC farm
#
# copyright 2006 - 2007 Genome Research Limited / Gildas Le Nadan
# This script is released under the Gnu Public
# License (GPL) version 2 and Later
# Enable strictures first so that every declaration below is checked.
use strict;
use warnings;
# Script version, kept as a plain string. An unquoted 1.0.0 is parsed by Perl
# as a v-string (the characters chr(1), chr(0), chr(0)) rather than the text
# "1.0.0".
my $version = "1.0.0";
#### PARAMETERS YOU CAN TWEAK

# To debug this script in your environment, set $DEBUG to 1 and run
#   BB=echo BBDISP=127.0.0.1 ./lsf_mon.pl
# on one of your nodes.
my $DEBUG = 0;    # 1 for debug, 0 otherwise

# How the LSF commands must be run. Both commands are executed right here,
# and their output is kept one line per array element.
my @lsload = qx{. /software/noarch/lsf/conf/profile.lsf && /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/lsload};
my @bhosts = qx{. /software/noarch/lsf/conf/profile.lsf && /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bhosts -w};

# Header lines to ignore in the command outputs (used as regexps).
my $bhosts_banner = 'HOST_NAME STATUS JL/U MAX NJOBS RUN SSUSP USUSP RSV';
my $lsload_banner = 'HOST_NAME status r15s r1m r15m ut pg ls it tmp swp mem';

# Classes are a way to group the results of hosts that belong to the same
# group/class. A regexp extracts the class from the host name, so your host
# names must be related to the class they are part of.
my $USE_CLASSES = 1;    # 1 if we use the concept of server classes, 0 otherwise

# How to split a host name into its class: the first capture group is the
# class name.
my $class_regex = '^(pingu|turing|bc-[0-9]*)-*[0-9]*-*[0-9]*';

# Hobbit/Xymon configuration.
my $farm         = 'farm-login';    # the hostname you want to report under
my $hobbitcolumn = 'lsf';           # the column name for the test
my $color        = 'green';         # default color
my $summary      = 'LSF report';    # title of the test output
#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)

# Xymon client command and display server, filled in from the environment
# by the main section.
my ( $bb, $bbdisp ) = ( "", "" );

# %hash:  per-host fields parsed from the lsload and bhosts outputs.
# %class: per-class counters, indexed by class name and then status string.
my ( %hash, %class );

# Space-separated host lists, built while the hosts are classified.
my ( $admclosed_hosts, $busy_hosts, $unavail_hosts, $unreach_hosts ) = ("") x 4;

# Extra messages; send_report turns the status yellow when either is set.
my ( $non_efficient, $misc ) = ( "", "" );

# Number of hosts seen in each status.
my (
    $closed_full_counter, $closed_busy_counter, $closed_adm_counter,
    $ok_counter,          $unavail_counter,     $unreach_counter,
) = (0) x 6;

# Status strings as matched against the STATUS column of bhosts; they are
# also the keys of the per-class counters.
my $CL_ADM      = "closed_Adm";
my $CL_BUSY     = "closed_Busy";
my $CL_FULL     = "closed_Full";
my $OK          = "ok";
my $UNREACHABLE = "unreach";
my $UNAVAILABLE = "unavail";
### FUNCTIONS
# Count one host towards its class for the given status.
#   $status - status string used as the counter key
#   $host   - host name; its class is the first capture group of $class_regex
# Hosts whose name does not match $class_regex are not counted in any class.
sub add_to_class {
    my ( $status, $host ) = @_;
    if ( $host =~ m/$class_regex/ ) {
        # Taking the reference creates the class and counter slots if needed.
        my $counter = \$class{$1}{$status};
        $$counter = 0 unless defined $$counter;
        $$counter++;
    }
}
# Record a host that is closed administratively: add it to the host list,
# bump the global counter and, when classes are enabled, its class counter.
sub add_to_closed_adm {
    my $node = shift;
    $admclosed_hosts .= " $node";
    $closed_adm_counter += 1;
    if ($USE_CLASSES) {
        add_to_class( $CL_ADM, $node );
    }
}
# Record a host that is closed because it is busy: add it to the host list,
# bump the global counter and, when classes are enabled, its class counter.
sub add_to_busy {
    my $node = shift;
    $busy_hosts .= " $node";
    $closed_busy_counter += 1;
    if ($USE_CLASSES) {
        add_to_class( $CL_BUSY, $node );
    }
}
# Record a host in the "ok" state: bump the global counter and, when classes
# are enabled, its class counter. No host list is kept for this state.
sub add_to_ok {
    my $node = shift;
    $ok_counter += 1;
    if ($USE_CLASSES) {
        add_to_class( $OK, $node );
    }
}
# Record a host that is closed because it is full: bump the global counter
# and, when classes are enabled, its class counter. No host list is kept for
# this state.
sub add_to_full {
    my $node = shift;
    $closed_full_counter += 1;
    if ($USE_CLASSES) {
        add_to_class( $CL_FULL, $node );
    }
}
# Record an unavailable host: add it to the host list, bump the global
# counter and, when classes are enabled, its class counter.
sub add_to_unavail {
    my $node = shift;
    $unavail_hosts .= " $node";
    $unavail_counter += 1;
    if ($USE_CLASSES) {
        add_to_class( $UNAVAILABLE, $node );
    }
}
# Record an unreachable host: add it to the host list, bump the global
# counter and, when classes are enabled, its class counter.
sub add_to_unreach {
    my $node = shift;
    $unreach_hosts .= " $node";
    $unreach_counter += 1;
    if ($USE_CLASSES) {
        add_to_class( $UNREACHABLE, $node );
    }
}
# Build the part of the report that lists, by name, the hosts that are
# neither "ok" nor "closed full".
# Returns a string with one titled section per non-empty host list, or an
# empty string when every list is empty.
# Unreachable hosts are listed too: add_to_unreach collects them in
# $unreach_hosts, which would otherwise never be reported.
sub print_non_full_or_ok {
    my @sections = (
        [ "Closed adm hosts", $admclosed_hosts ],
        [ "Busy hosts",       $busy_hosts ],
        [ "Unavail hosts",    $unavail_hosts ],
        [ "Unreach hosts",    $unreach_hosts ],
    );
    my $result = "";
    foreach my $section (@sections) {
        my ( $title, $hosts ) = @{$section};
        next if $hosts eq "";
        $result .= "\n" . $title . ":\n" . $hosts . "\n";
    }
    return $result;
}
# Return $value as a percentage of $total, rounded to the nearest integer and
# formatted without decimals (e.g. "93").
#   $value - the part
#   $total - the whole
# A zero or undefined $total yields "0" instead of dying with
# "Illegal division by zero".
sub percent {
    my ( $value, $total ) = @_;
    return "0" unless $total;
    return sprintf( "%.0f", ( $value / $total ) * 100 );
}
# Build the "Statistics per class" table: one tab-separated line per class,
# in sorted order, giving the percentage of hosts that are full, ok, busy or
# in any other state (closed administratively, unavailable or unreachable).
# Missing counters in %class are set to 0 as a side effect.
# Returns the table as a string, header included.
sub print_classes {
    my @statuses = ( $OK, $CL_FULL, $CL_BUSY, $CL_ADM, $UNAVAILABLE, $UNREACHABLE );
    my $table = "\nStatistics per class (%)\nCLASS\tFULL\tOK\tBUSY\tOTHER\n";
    foreach my $name ( sort keys %class ) {
        my $count = $class{$name};

        # Make sure every counter we are about to read exists.
        foreach my $status (@statuses) {
            $count->{$status} = 0 unless defined $count->{$status};
        }

        my $other = $count->{$CL_ADM} + $count->{$UNAVAILABLE} + $count->{$UNREACHABLE};
        my $total = $other + $count->{$OK} + $count->{$CL_FULL} + $count->{$CL_BUSY};

        # Column order follows the header: FULL, OK, BUSY, OTHER.
        my @columns = map { percent( $_, $total ) }
            ( $count->{$CL_FULL}, $count->{$OK}, $count->{$CL_BUSY}, $other );
        $table .= join( "\t", $name, @columns ) . "\n";
    }
    return $table;
}
# Build the "Hosts by status" section: one "name : count" line per status.
# Returns the section as a string.
sub print_summary {
    my @rows = (
        [ "closed administratively: %4u\n", $closed_adm_counter ],
        [ "closed busy : %4u\n", $closed_busy_counter ],
        [ "closed full : %4u\n", $closed_full_counter ],
        [ "ok : %4u\n", $ok_counter ],
        [ "unavail : %4u\n", $unavail_counter ],
    );
    my $section = "Hosts by status\n";
    foreach my $row (@rows) {
        my ( $format, $count ) = @{$row};
        $section .= sprintf( $format, $count );
    }
    return $section;
}
# A non-efficient job is a job limited by something other than the CPU, i.e.
# one that sits on the farm doing nothing, or next to nothing.
# The way I used to measure it was not fine-grained enough, so I removed this
# feature for the time being.
#
#sub print_non_efficient {
# my $result = "";
# if ( $non_efficient ne "" ) {
# $result .= "\n&YELLOW Hosts running non efficient tasks:\n".$non_efficient."\n";
# }
# return $result;
#}
# Build the "Misc" section from $misc.
# Returns an empty string when there is no miscellaneous message.
sub print_misc_messages {
    return "" if $misc eq "";
    return "\nMisc:\n" . $misc . "\n";
}
# Parse the lsload output held in @lsload and store, for each host, the
# sixth column as $hash{host}{cpu_usage}.
# Skipped lines:
#   - the header, recognised either by $lsload_banner or by its first field
#     (the banner pattern only matches when the column spacing of the real
#     output is exactly the one written in $lsload_banner);
#   - blank lines, which would otherwise be stored under an undefined key.
sub process_lsload {
    foreach my $line (@lsload) {
        next if $line =~ /$lsload_banner/;
        my @fields = split( " ", $line );
        next unless @fields;
        next if $fields[0] eq "HOST_NAME";
        $hash{ $fields[0] }{cpu_usage} = $fields[5];
    }
}
# Parse the bhosts output held in @bhosts and store, for each host, its
# status and job counters in %hash (columns 2, 4, 5, 6 and 7 of the output).
# Skipped lines:
#   - the header, recognised either by $bhosts_banner or by its first field
#     (the banner pattern only matches when the column spacing of the real
#     output is exactly the one written in $bhosts_banner);
#   - blank lines, which would otherwise be stored under an undefined key.
sub process_bhosts {
    foreach my $line (@bhosts) {
        next if $line =~ /$bhosts_banner/;
        my @fields = split( " ", $line );
        next unless @fields;
        next if $fields[0] eq "HOST_NAME";
        my $entry = $hash{ $fields[0] } ||= {};
        $entry->{status}   = $fields[1];
        $entry->{max_jobs} = $fields[3];
        $entry->{nb_jobs}  = $fields[4];
        $entry->{run_jobs} = $fields[5];
        $entry->{sys_susp} = $fields[6];
    }
}
# Classify one host according to the status recorded in $hash{$host}{status}
# and hand it to the matching add_to_* function.
#   $host - host name, a key of %hash
# The patterns are tried in a fixed order and only the first match counts.
# A host without a status (for instance one that appears only in the lsload
# output, which provides no status) is skipped instead of being matched
# against the patterns as an undefined value. A status that matches none of
# the patterns is ignored.
sub process_host {
    my ( $host ) = @_;
    my $status = $hash{$host}{status};
    return unless defined $status;

    my @handlers = (
        [ $CL_ADM,      \&add_to_closed_adm ],
        [ $CL_BUSY,     \&add_to_busy ],
        [ $CL_FULL,     \&add_to_full ],
        [ $OK,          \&add_to_ok ],
        [ $UNAVAILABLE, \&add_to_unavail ],
        [ $UNREACHABLE, \&add_to_unreach ],
    );
    foreach my $entry (@handlers) {
        my ( $pattern, $handler ) = @{$entry};
        if ( $status =~ /$pattern/ ) {
            $handler->($host);
            last;
        }
    }
    return;
}
# Send the status message to the Xymon daemon through the $bb command.
#   $statusmsg - body of the report
# The color is switched to yellow when $non_efficient or $misc is set.
# Returns the value returned by system() (0 on success).
sub send_report {
    my ( $statusmsg ) = @_;
    if ( ( $non_efficient ne "" ) || ( $misc ne "" ) ) {
        $color = "yellow";
    }
    # The whole status message is passed as a single argument.
    my $message = "status " . $farm . "." . $hobbitcolumn . " " . $color . " " . $summary . "\n\n" . $statusmsg;
    # Run the command without a shell, so that quotes, dollar signs or
    # backquotes in the message cannot break or alter the command line.
    # $bb is split on whitespace to keep accepting a command with options.
    my @cmd = ( split( " ", $bb ), $bbdisp, $message );
    my $status = system(@cmd);
    if ( $status != 0 ) {
        warn "lsf_mon: sending the report with '$bb' failed (system returned $status)\n";
    }
    return $status;
}
#### MAIN
# The BB and BBDISP environment settings are required to send the report.
$bb     = $ENV{BB}     || die "BB not defined";
$bbdisp = $ENV{BBDISP} || die "BBDISP not defined";

# Parse the LSF command outputs into %hash.
process_lsload();
process_bhosts();

# Classify every host that was seen, in sorted order.
foreach my $host ( sort keys %hash ) {
    print "$host\n" if $DEBUG;
    process_host($host);
}

# Assemble the report section by section; the class table is optional and
# the "non efficient" section (print_non_efficient) is currently disabled.
my $statusmsg = join( "",
    print_summary(),
    ( $USE_CLASSES ? print_classes() : () ),
    print_non_full_or_ok(),
    print_misc_messages(),
);
send_report($statusmsg);
===== Known Bugs and Issues =====
===== To Do =====
===== Credits =====
===== Changelog =====
* **2007-04-08**
* Initial release