Differences

This shows you the differences between two versions of the page.

Link to this comparison view

monitors:lsf_mon [2009/11/23 05:51] (current)
Line 1: Line 1:
 +====== lsf_mon ======
 +
 +^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] |
 +^ Compatibility | Xymon 4.2 |
 +^ Requirements | Perl, unix, Platform LSF |
 +^ Download | None |
 +^ Last Update | 2007-04-08 |
 +
 +===== Description =====
 +
 +This script monitors the member hosts of an LSF HPC farm.
 +
 +It displays the number of hosts in each state in an NCV-compatible format, and lists the hosts whose status is "closed administratively" or "unavailable".
 +
 +You can also group the statistics per "host class" using a regexp (see the script source for more details).
 +
 +Here is a sample report:
 +
 +<code>
 +LSF report
 +
 +Hosts by status
 +closed administratively: ​   4
 +closed busy            :    0
 +closed full            :  580
 +ok                     : ​   4
 +unavail ​               :    5
 +
 +Statistics per class (%)
 +CLASS    FULL    OK    BUSY    OTHER
 +bc-1    93    0    0    7
 +bc-10    100    0    0    0
 +bc-2    95    0    0    5
 +bc-3    90    0    0    10
 +bc-4    95    0    0    5
 +bc-5    97    0    0    3
 +bc-6    100    0    0    0
 +bc-7    100    0    0    0
 +bc-8    100    0    0    0
 +bc-9    100    0    0    0
 +pingu    0    100    0    0
 +turing ​   0    100    0    0
 +
 +Closed adm hosts:
 + ​bc-1-2-12 bc-2-2-01 bc-5-2-01 bc-5-2-02
 +
 +Unavail hosts:
 + ​bc-1-3-03 bc-2-1-05 bc-3-3-08 bc-3-3-12 bc-4-3-08 ​
 +</code>
 +===== Installation =====
 +=== Client side ===
 +  * Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on Linux/Debian), with owner hobbit.hobbit and mode 0755
 +  * Add an entry to the clientlaunch.cfg file on the host running the script so that the client launches it. Example on a Linux/Debian host:<code>
 +# lsf stats
 +[lsf]
 +       ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg
 +       CMD /usr/lib/hobbit/client/ext/lsf_mon.pl
 +       INTERVAL 5m
 +</code>
 +
 +=== Server side ===
 +  * In hobbitserver.cfg:
 +    - Append "lsf=ncv" to TEST2RRD
 +    - Append "lsf" to GRAPHS
 +    - Add the following line for NCV:<code>
 +NCV_lsf="*:NONE,closedadministrativ:GAUGE,closedbusy:GAUGE,closedfull:GAUGE,ok:GAUGE,unavail:GAUGE"
 +</code>
 +  * You need an entry in one of the bb-hosts files:<code>
 +1.2.3.4     myfarm      # noconn TRENDS:*,lsf
 +</code>
 +  * Edit the hobbitgraph.cfg configuration file to add:<code>
 +[lsf]
 +       DEF:admin=lsf.rrd:closedadministrativ:AVERAGE
 +       DEF:full=lsf.rrd:closedfull:AVERAGE
 +       DEF:busy=lsf.rrd:closedbusy:AVERAGE
 +       DEF:ok=lsf.rrd:ok:AVERAGE
 +       DEF:unavail=lsf.rrd:unavail:AVERAGE
 +       TITLE LSF status
 +       -l 0
 +       YAXIS #
 +       AREA:unavail#bbbbbb:unavailable
 +       GPRINT:unavail:LAST: \: %5.1lf (cur)
 +       GPRINT:unavail:MAX: \: %5.1lf (max)
 +       GPRINT:unavail:MIN: \: %5.1lf (min)
 +       GPRINT:unavail:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:busy#ff0000:closed busy
 +       GPRINT:busy:LAST: \: %5.1lf (cur)
 +       GPRINT:busy:MAX: \: %5.1lf (max)
 +       GPRINT:busy:MIN: \: %5.1lf (min)
 +       GPRINT:busy:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:admin#5555cc:closed admin.
 +       GPRINT:admin:LAST: \: %5.1lf (cur)
 +       GPRINT:admin:MAX: \: %5.1lf (max)
 +       GPRINT:admin:MIN: \: %5.1lf (min)
 +       GPRINT:admin:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:ok#ff9900:ok
 +       GPRINT:ok:LAST: \: %5.1lf (cur)
 +       GPRINT:ok:MAX: \: %5.1lf (max)
 +       GPRINT:ok:MIN: \: %5.1lf (min)
 +       GPRINT:ok:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:full#33cc33:closed full
 +       GPRINT:full:LAST: \: %5.1lf (cur)
 +       GPRINT:full:MAX: \: %5.1lf (max)
 +       GPRINT:full:MIN: \: %5.1lf (min)
 +       GPRINT:full:AVERAGE: \: %5.1lf (avg)\n
 +</code>
 +===== Source =====
 +==== lsf_mon.pl ====
 +<hidden onHidden="Show Code ⇲" onVisible="Hide Code ⇱">
 +<code perl>
 +#​!/​usr/​bin/​perl -w
 +#
 +# client-side script to monitor a lsf HPC farm
 +#
 +# copyright 2006 - 2007  Genome Research Limited / Gildas Le Nadan
 +# This script is released under the Gnu Public
 +# License (GPL) version 2 and Later
 +
 +my $version = 1.0.0;
 +
 +use strict;
 +
 +#### PARAMETERS YOU CAN TWEAK
 +
 +# you can debug this script in your environment by setting 1 below and running
 +# BB=echo BBDISP=127.0.0.1 ./​lsf_mon.pl on one of your nodes
 +my $DEBUG = 0; # 1 for debug, 0 otherwise
 +
 +# how the lsf commands must be run
 +my @lsload=`. /​software/​noarch/​lsf/​conf/​profile.lsf && ​ /​usr/​local/​lsf/​6.1/​linux2.6-glibc2.3-amd64/​bin/​lsload`;​
 +my @bhosts=`. /​software/​noarch/​lsf/​conf/​profile.lsf && ​ /​usr/​local/​lsf/​6.1/​linux2.6-glibc2.3-amd64/​bin/​bhosts -w`;
 +
 +# lines to ignore in the outputs
 +my $bhosts_banner="​HOST_NAME ​         STATUS ​         JL/U    MAX  NJOBS    RUN  SSUSP  USUSP    RSV";
 +my $lsload_banner="​HOST_NAME ​      ​status ​ r15s   ​r1m ​ r15m   ​ut ​   pg  ls    it   ​tmp ​  ​swp ​  ​mem";​
 +
 +# classes are a way to regroup the results for certain hosts that a part of a
 +# group/​class. It uses a regexp to separate the classes, so your hostnames
 +# must be related to the class they are part of
 +my $USE_CLASSES = 1; # 1 if we use the concept of server classes, 0 otherwise
 +
 +# how to split our hostnames into a classes
 +my $class_regex = "​^(pingu|turing|bc-[0-9]*)-*[0-9]*-*[0-9]*";​
 +
 +# hobbit config
 +my $farm = "​farm-login";​ #​ the hostname you want to report under
 +my $hobbitcolumn = "​lsf";​ #​ the column name for the test
 +my $color = "​green";​ #​ default color
 +my $summary = "LSF report";​ #​ title of the test output
 +
 +#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)
 +
 +my $bb = "";​
 +my $bbdisp = "";​
 +
 +my %hash = ();
 +my %class = ();
 +
 +my $admclosed_hosts = "";​
 +my $busy_hosts = "";​
 +my $unavail_hosts = "";​
 +my $unreach_hosts = "";​
 +my $non_efficient = "";​
 +my $misc = "";​
 +
 +my $closed_full_counter = 0;
 +my $closed_busy_counter = 0;
 +my $closed_adm_counter = 0;
 +my $ok_counter = 0;
 +my $unavail_counter = 0;
 +my $unreach_counter = 0;
 +
 +my $CL_ADM = "​closed_Adm";​
 +my $CL_BUSY = "​closed_Busy";​
 +my $CL_FULL = "​closed_Full";​
 +my $OK = "​ok";​
 +my $UNREACHABLE = "​unreach";​
 +my $UNAVAILABLE = "​unavail";​
 +
 +### FUNCTIONS
 +
 +sub add_to_class {
 + my ( $status, $host ) = @_;
 + if ( $host =~ qr/​$class_regex/​ ) {
 + my $my_class = $1;
 + if ( ! defined $class{$my_class}{$status} ) {
 + $class{$my_class}{$status} = 0;
 + }
 + $class{$my_class}{$status}++;​
 + }
 +}
 +
 +sub add_to_closed_adm {
 + my ( $host ) = @_;
 + $closed_adm_counter++;​
 + $admclosed_hosts .=  " $host";​
 + add_to_class( $CL_ADM, $host) if $USE_CLASSES;​
 +}
 +
 +sub add_to_busy {
 + my ( $host ) = @_;
 + $closed_busy_counter++;​
 + $busy_hosts .= " $host";​
 + add_to_class( $CL_BUSY, $host) if $USE_CLASSES;​
 +}
 +
 +sub add_to_ok {
 + my ( $host ) = @_;
 + $ok_counter++;​
 + add_to_class( $OK, $host) if $USE_CLASSES;​
 +}
 +
 +sub add_to_full {
 + my ( $host ) = @_;
 + $closed_full_counter++;​
 + add_to_class( $CL_FULL, $host) if $USE_CLASSES;​
 +}
 +
 +sub add_to_unavail {
 + my ( $host ) = @_;
 + $unavail_counter++;​
 + $unavail_hosts .= " $host";​
 + add_to_class( $UNAVAILABLE,​ $host) if $USE_CLASSES;​
 +}
 +
 +sub add_to_unreach {
 + my ( $host ) = @_;
 + $unreach_counter++;​
 + $unreach_hosts .= " $host";​
 + add_to_class( $UNREACHABLE,​ $host) if $USE_CLASSES;​
 +}
 +
 +sub print_non_full_or_ok {
 + my $result = "";​
 + if ( $admclosed_hosts ne ""​ ) {
 + $result .= "​\nClosed adm hosts:​\n"​.$admclosed_hosts."​\n";​
 + }
 + if ( $busy_hosts ne ""​ ) {
 + $result .= "​\nBusy hosts:​\n"​.$busy_hosts."​\n";​
 + }
 + if ( $unavail_hosts ne ""​ ) {
 + $result .= "​\nUnavail hosts:​\n"​.$unavail_hosts."​\n";​
 + }
 + return $result;
 +}
 +
 +sub percent {
 + my ( $value, $total ) = @_;
 + my $percent =  ( $value / $total ) * 100;
 + return sprintf( "​%.0f",​ $percent);
 +
 +
 +sub print_classes {
 + my $result = "";​
 + $result .= "​\nStatistics per class (%)\nCLASS\tFULL\tOK\tBUSY\tOTHER\n";​
 + foreach my $my_class ( sort keys %class ) {
 + # fully populate the variables we need
 + my @variables = ( $OK, $CL_FULL, $CL_BUSY, $CL_ADM, $UNAVAILABLE,​ $UNREACHABLE );
 + foreach my $v ( @variables ) {
 + unless ( defined( $class{$my_class}{$v} ) ) {
 + $class{$my_class}{$v} = 0;
 + }
 + }
 + my $others = $class{$my_class}{$CL_ADM}
 + + $class{$my_class}{$UNAVAILABLE}
 + + $class{$my_class}{$UNREACHABLE};​
 +
 + my $total = $others + $class{$my_class}{$OK}
 + + $class{$my_class}{$CL_FULL}
 + + $class{$my_class}{$CL_BUSY};​
 +
 + $result .= $my_class."​\t";​
 + $result .= percent( $class{$my_class}{$CL_FULL},​ $total );
 + $result .= "​\t";​
 + $result .= percent( $class{$my_class}{$OK},​ $total );
 + $result .= "​\t";​
 + $result .= percent( $class{$my_class}{$CL_BUSY},​ $total );
 + $result .= "​\t";​
 + $result .= percent( $others, $total );
 + $result .= "​\n";​
 + }
 + return $result;
 +}
 +
 +sub print_summary {
 + my $result = "Hosts by status\n";​
 + $result .= sprintf( "​closed administratively:​ %4u\n",​ $closed_adm_counter);​
 + $result .= sprintf( "​closed busy            : %4u\n",​ $closed_busy_counter );
 + $result .= sprintf( "​closed full            : %4u\n",​ $closed_full_counter );
 + $result .= sprintf( "​ok ​                    : %4u\n",​ $ok_counter );
 + $result .= sprintf( "​unavail ​               : %4u\n",​ $unavail_counter );
 + return $result;
 +}
 +
 +# A non-efficient job is a job that is limited by something other than the
 +# cpu, i.e. one that sits on the farm doing nothing, or just about.
 +# The way I used to measure it was not fine-grained enough, so I removed this
 +# feature for the time being.
 +#
 +#sub print_non_efficient {
 +# my $result = "";​
 +# if ( $non_efficient ne ""​ ) {
 +# $result .= "​\n&​YELLOW Hosts running non efficient tasks:​\n"​.$non_efficient."​\n";​
 +# }
 +# return $result;
 +#}
 +
 +sub print_misc_messages {
 + my $result = "";​
 + if ( $misc ne ""​ ) {
 + $result .= "​\nMisc:​\n"​.$misc."​\n";​
 + }
 + return $result;
 +}
 +
 +sub process_lsload {
 + foreach my $line ( @lsload ) {
 + unless ( $line =~ /​$lsload_banner/​ ) {
 + my @fields = split( " ", $line );
 + $hash{$fields[0]}{cpu_usage} = $fields[5];
 + }
 + }
 +}
 +
 +sub process_bhosts {
 + foreach my $line ( @bhosts ) {
 + unless ( $line =~ /​$bhosts_banner/​ ) {
 + my @fields = split( " ", $line );
 + $hash{$fields[0]}{status} = $fields[1];
 + $hash{$fields[0]}{max_jobs} = $fields[3];
 + $hash{$fields[0]}{nb_jobs} = $fields[4];
 + $hash{$fields[0]}{run_jobs} = $fields[5];
 + $hash{$fields[0]}{sys_susp} = $fields[6];
 + }
 + }
 +}
 +
 +sub process_host {
 + my ( $host ) = @_;
 +
 + # add the host to a status class
 + if ( $hash{$host}{status} =~ qr/$CL_ADM/ ) {
 + add_to_closed_adm( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/​$CL_BUSY/​ ) {
 + add_to_busy( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/​$CL_FULL/​ ) {
 + add_to_full( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/$OK/ ) {
 + add_to_ok( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/​$UNAVAILABLE/​ ) {
 + add_to_unavail( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/​$UNREACHABLE/​ ) {
 + add_to_unreach( $host );
 + }
 +}
 +
 +sub send_report {
 + my ( $statusmsg ) = @_;
 + if ( ( $non_efficient ne ""​ ) || ( $misc ne ""​ ) ) {
 + $color = "​yellow";​
 + }
 + # Build the command we use to send a status to the Xymon daemon
 + my $cmd = $bb . " " . $bbdisp . " \"​status " . $farm . "​."​ . $hobbitcolumn . " " . $color . " " . $summary . "​\n\n"​ . $statusmsg . "​\"";​
 + # And send the message
 + system $cmd;
 +}
 +
 +#### MAIN
 +
 +# Get the BB and BBDISP environment settings.
 +$bb = $ENV{"​BB"​} || die "BB not defined";​
 +$bbdisp = $ENV{"​BBDISP"​} || die "​BBDISP not defined";​
 +
 +my $statusmsg = "";​
 +
 +# get the lsf reports
 +process_lsload;​
 +process_bhosts;​
 +
 +# get host status
 +foreach my $host ( sort keys %hash ) {
 + print "​$host\n"​ if $DEBUG;
 + process_host( $host );
 +}
 +
 +$statusmsg .= print_summary;​
 +$statusmsg .= print_classes if $USE_CLASSES;​
 +$statusmsg .= print_non_full_or_ok;​
 +#$statusmsg .= print_non_efficient;​
 +$statusmsg .= print_misc_messages;​
 +send_report( $statusmsg );
 +</​code>​
 +</​hidden>​
 +
 +===== Known  Bugs and Issues =====
 +
 +===== To Do =====
 +
 +===== Credits =====
 +
 +===== Changelog =====
 +
 +  * **2007-04-08**
 +    * Initial release
  
  • monitors/lsf_mon.txt
  • Last modified: 2009/11/23 05:51
  • (external edit)