====== lsf_mon ====== ^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] | ^ Compatibility | Xymon 4.2 | ^ Requirements | Perl, unix, Platform LSF | ^ Download | None | ^ Last Update | 2007-04-08 | ===== Description ===== This script monitors the members of an LSF HPC farm. It displays the number of hosts in each state in an NCV-compatible format, and gives a list of hosts with a status of "closed administratively", "closed busy" or "unavailable". You can also group statistics per "host class" using a regexp (see the script source for more details). Here is a sample report: LSF report Hosts by status closed administratively: 4 closed busy : 0 closed full : 580 ok : 4 unavail : 5 Statistics per class (%) CLASS FULL OK BUSY OTHER bc-1 93 0 0 7 bc-10 100 0 0 0 bc-2 95 0 0 5 bc-3 90 0 0 10 bc-4 95 0 0 5 bc-5 97 0 0 3 bc-6 100 0 0 0 bc-7 100 0 0 0 bc-8 100 0 0 0 bc-9 100 0 0 0 pingu 0 100 0 0 turing 0 100 0 0 Closed adm hosts: bc-1-2-12 bc-2-2-01 bc-5-2-01 bc-5-2-02 Unavail hosts: bc-1-3-03 bc-2-1-05 bc-3-3-08 bc-3-3-12 bc-4-3-08 ===== Installation ===== === Client side === * Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on linux/debian), with owner hobbit.hobbit and permissions 0755 * Add an entry to the clientlaunch.cfg file on each host that runs the script. 
Example on a linux/debian host: # lsf stats [lsf] ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg CMD /usr/lib/hobbit/client/ext/lsf_mon.pl INTERVAL 5m === Server side === * In hobbitserver.cfg: - Append "lsf=ncv" to TEST2RRD - Append "lsf" to GRAPHS - Add the following line for NCV: NCV_lsf="*:NONE,closedadministrativ:GAUGE,closedbusy:GAUGE,closedfull:GAUGE,ok:GAUGE,unavail:GAUGE" * You need an entry in one of the bb-hosts files; its hostname must match the $farm setting in the script ("farm-login" by default): 1.2.3.4 myfarm # noconn TRENDS:*,lsf * Edit the hobbitgraph.cfg configuration to add: [lsf] DEF:admin=lsf.rrd:closedadministrativ:AVERAGE DEF:full=lsf.rrd:closedfull:AVERAGE DEF:busy=lsf.rrd:closedbusy:AVERAGE DEF:ok=lsf.rrd:ok:AVERAGE DEF:unavail=lsf.rrd:unavail:AVERAGE TITLE LSF status -l 0 YAXIS # AREA:unavail#bbbbbb:unavailable GPRINT:unavail:LAST: \: %5.1lf (cur) GPRINT:unavail:MAX: \: %5.1lf (max) GPRINT:unavail:MIN: \: %5.1lf (min) GPRINT:unavail:AVERAGE: \: %5.1lf (avg)\n STACK:busy#ff0000:closed busy GPRINT:busy:LAST: \: %5.1lf (cur) GPRINT:busy:MAX: \: %5.1lf (max) GPRINT:busy:MIN: \: %5.1lf (min) GPRINT:busy:AVERAGE: \: %5.1lf (avg)\n STACK:admin#5555cc:closed admin. 
GPRINT:admin:LAST: \: %5.1lf (cur) GPRINT:admin:MAX: \: %5.1lf (max) GPRINT:admin:MIN: \: %5.1lf (min) GPRINT:admin:AVERAGE: \: %5.1lf (avg)\n STACK:ok#ff9900:ok GPRINT:ok:LAST: \: %5.1lf (cur) GPRINT:ok:MAX: \: %5.1lf (max) GPRINT:ok:MIN: \: %5.1lf (min) GPRINT:ok:AVERAGE: \: %5.1lf (avg)\n STACK:full#33cc33:closed full GPRINT:full:LAST: \: %5.1lf (cur) GPRINT:full:MAX: \: %5.1lf (max) GPRINT:full:MIN: \: %5.1lf (min) GPRINT:full:AVERAGE: \: %5.1lf (avg)\n

===== Source =====

==== lsf_mon.pl ====

#!/usr/bin/perl -w
#
# client-side script to monitor a lsf HPC farm
#
# copyright 2006 - 2007 Genome Research Limited / Gildas Le Nadan
# This script is released under the Gnu Public
# License (GPL) version 2 and Later

use strict;
use warnings;

# Quoted on purpose: a bare 1.0.0 is parsed by Perl as a v-string (three
# characters with ordinals 1, 0 and 0), not as the text "1.0.0".
my $version = "1.0.0";

#### PARAMETERS YOU CAN TWEAK

# you can debug this script in your environment by setting 1 below and running
# BB=echo BBDISP=127.0.0.1 ./lsf_mon.pl on one of your nodes
my $DEBUG = 0; # 1 for debug, 0 otherwise

# how the lsf commands must be run
my @lsload=`. /software/noarch/lsf/conf/profile.lsf && /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/lsload`;
my @bhosts=`. /software/noarch/lsf/conf/profile.lsf && /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bhosts -w`;

# lines to ignore in the outputs: the header line of both commands starts
# with this word. Only the first field is compared, so the check does not
# depend on the column widths used by a given LSF version.
my $header_field = "HOST_NAME";

# classes are a way to regroup the results for certain hosts that are part of
# a group/class. It uses a regexp to separate the classes, so your hostnames
# must be related to the class they are part of
my $USE_CLASSES = 1; # 1 if we use the concept of server classes, 0 otherwise

# how to split our hostnames into classes
my $class_regex = "^(pingu|turing|bc-[0-9]*)-*[0-9]*-*[0-9]*";

# hobbit config
my $farm = "farm-login";    # the hostname you want to report under
my $hobbitcolumn = "lsf";   # the column name for the test
my $color = "green";        # default color
my $summary = "LSF report"; # title of the test output

#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)
my $bb = "";
my $bbdisp = "";

my %hash = ();  # per-host data: $hash{host}{status|cpu_usage|...}
my %class = (); # per-class counters: $class{class}{status} = nb of hosts

my $admclosed_hosts = "";
my $busy_hosts = "";
my $unavail_hosts = "";
my $unreach_hosts = "";
my $non_efficient = "";
my $misc = "";

my $closed_full_counter = 0;
my $closed_busy_counter = 0;
my $closed_adm_counter = 0;
my $ok_counter = 0;
my $unavail_counter = 0;
my $unreach_counter = 0;

# status strings as found in the STATUS column of "bhosts -w"
my $CL_ADM = "closed_Adm";
my $CL_BUSY = "closed_Busy";
my $CL_FULL = "closed_Full";
my $OK = "ok";
my $UNREACHABLE = "unreach";
my $UNAVAILABLE = "unavail";

### FUNCTIONS

# add_to_class( $status, $host )
# Increment the counter of $status for the class $host belongs to. Hosts
# whose name does not match $class_regex are not counted in any class.
sub add_to_class {
    my ( $status, $host ) = @_;

    if ( $host =~ qr/$class_regex/ ) {
        $class{$1}{$status}++;
    }
    return;
}

# The add_to_* functions below each record one host in a given state: they
# increment the global counter, remember the hostname when that state is
# listed in the report, and update the per-class statistics if enabled.

sub add_to_closed_adm {
    my ( $host ) = @_;

    $closed_adm_counter++;
    $admclosed_hosts .= " $host";
    add_to_class( $CL_ADM, $host ) if $USE_CLASSES;
    return;
}

sub add_to_busy {
    my ( $host ) = @_;

    $closed_busy_counter++;
    $busy_hosts .= " $host";
    add_to_class( $CL_BUSY, $host ) if $USE_CLASSES;
    return;
}

sub add_to_ok {
    my ( $host ) = @_;

    $ok_counter++;
    add_to_class( $OK, $host ) if $USE_CLASSES;
    return;
}

sub add_to_full {
    my ( $host ) = @_;

    $closed_full_counter++;
    add_to_class( $CL_FULL, $host ) if $USE_CLASSES;
    return;
}

sub add_to_unavail {
    my ( $host ) = @_;

    $unavail_counter++;
    $unavail_hosts .= " $host";
    add_to_class( $UNAVAILABLE, $host ) if $USE_CLASSES;
    return;
}

sub add_to_unreach {
    my ( $host ) = @_;

    $unreach_counter++;
    $unreach_hosts .= " $host";
    add_to_class( $UNREACHABLE, $host ) if $USE_CLASSES;
    return;
}

# Return the lists of hosts that are neither "ok" nor "closed full", one
# titled paragraph per state; states without any host are left out.
sub print_non_full_or_ok {
    my $result = "";

    if ( $admclosed_hosts ne "" ) {
        $result .= "\nClosed adm hosts:\n".$admclosed_hosts."\n";
    }
    if ( $busy_hosts ne "" ) {
        $result .= "\nBusy hosts:\n".$busy_hosts."\n";
    }
    if ( $unavail_hosts ne "" ) {
        $result .= "\nUnavail hosts:\n".$unavail_hosts."\n";
    }
    # unreachable hosts were collected but never shown; they are listed here
    # only (not in the summary counters) so that the set of values graphed
    # from the summary stays unchanged
    if ( $unreach_hosts ne "" ) {
        $result .= "\nUnreach hosts:\n".$unreach_hosts."\n";
    }
    return $result;
}

# percent( $value, $total )
# Return $value as a percentage of $total, rounded to an integer string.
# A total of 0 gives "0" instead of dying on a division by zero.
sub percent {
    my ( $value, $total ) = @_;

    return "0" unless $total;
    return sprintf( "%.0f", ( $value / $total ) * 100 );
}

# Return the per-class table: for each class (sorted by name) the percentage
# of hosts that are full, ok, busy, or in another state (closed adm,
# unavailable, unreachable).
sub print_classes {
    my $result = "\nStatistics per class (%)\nCLASS\tFULL\tOK\tBUSY\tOTHER\n";

    foreach my $my_class ( sort keys %class ) {
        # a state that was never seen for this class counts as 0
        my %count = map { $_ => ( $class{$my_class}{$_} || 0 ) }
            ( $OK, $CL_FULL, $CL_BUSY, $CL_ADM, $UNAVAILABLE, $UNREACHABLE );

        my $others = $count{$CL_ADM} + $count{$UNAVAILABLE} + $count{$UNREACHABLE};
        my $total = $others + $count{$OK} + $count{$CL_FULL} + $count{$CL_BUSY};

        $result .= join( "\t",
            $my_class,
            percent( $count{$CL_FULL}, $total ),
            percent( $count{$OK}, $total ),
            percent( $count{$CL_BUSY}, $total ),
            percent( $others, $total ),
        ) . "\n";
    }
    return $result;
}

# Return the "name : value" lines giving the number of hosts per state.
sub print_summary {
    my $result = "Hosts by status\n";

    $result .= sprintf( "closed administratively: %4u\n", $closed_adm_counter );
    $result .= sprintf( "closed busy : %4u\n", $closed_busy_counter );
    $result .= sprintf( "closed full : %4u\n", $closed_full_counter );
    $result .= sprintf( "ok : %4u\n", $ok_counter );
    $result .= sprintf( "unavail : %4u\n", $unavail_counter );
    return $result;
}

# a non efficient job is a job where there is a non cpu constraint, i-e that
# sits on the farm doing nothing or just about
# The way I used to measure it was not fine-grained enough so I removed this
# feature for the time being
#
#sub print_non_efficient {
#    my $result = "";
#    if ( $non_efficient ne "" ) {
#        $result .= "\n&YELLOW Hosts running non efficient tasks:\n".$non_efficient."\n";
#    }
#    return $result;
#}

# Return the "Misc" paragraph, or an empty string if there is no message.
sub print_misc_messages {
    my $result = "";

    if ( $misc ne "" ) {
        $result .= "\nMisc:\n".$misc."\n";
    }
    return $result;
}

# Store the 6th column of each lsload line as the cpu usage of the host.
# Blank lines and the header line are skipped.
sub process_lsload {
    foreach my $line ( @lsload ) {
        my @fields = split( " ", $line );
        next unless @fields;
        next if $fields[0] eq $header_field;

        $hash{$fields[0]}{cpu_usage} = $fields[5];
    }
    return;
}

# Store the status and job counters of each host listed by bhosts.
# Blank lines and the header line are skipped.
sub process_bhosts {
    foreach my $line ( @bhosts ) {
        my @fields = split( " ", $line );
        next unless @fields;
        next if $fields[0] eq $header_field;

        my $host = $fields[0];
        $hash{$host}{status} = $fields[1];
        $hash{$host}{max_jobs} = $fields[3];
        $hash{$host}{nb_jobs} = $fields[4];
        $hash{$host}{run_jobs} = $fields[5];
        $hash{$host}{sys_susp} = $fields[6];
    }
    return;
}

# process_host( $host )
# Count the host in the first state whose name appears in its bhosts status.
# Hosts with another status, or without any status, are not counted.
sub process_host {
    my ( $host ) = @_;

    # a host listed by lsload but not by bhosts has no status at all
    my $status = $hash{$host}{status};
    return unless defined $status;

    # checked in this order; the first matching state wins
    my @handlers = (
        [ $CL_ADM, \&add_to_closed_adm ],
        [ $CL_BUSY, \&add_to_busy ],
        [ $CL_FULL, \&add_to_full ],
        [ $OK, \&add_to_ok ],
        [ $UNAVAILABLE, \&add_to_unavail ],
        [ $UNREACHABLE, \&add_to_unreach ],
    );

    foreach my $handler ( @handlers ) {
        my ( $marker, $add ) = @{$handler};
        if ( index( $status, $marker ) >= 0 ) {
            $add->( $host );
            return;
        }
    }
    return;
}

# send_report( $statusmsg )
# Send the status message to the Xymon daemon using the $bb command. The
# color is turned to yellow when there is a misc or non-efficient message.
sub send_report {
    my ( $statusmsg ) = @_;

    if ( ( $non_efficient ne "" ) || ( $misc ne "" ) ) {
        $color = "yellow";
    }

    # dots in the hostname are sent as commas in a status message
    ( my $bbhost = $farm ) =~ tr/./,/;

    my $message = "status " . $bbhost . "." . $hobbitcolumn . " " . $color
        . " " . $summary . "\n\n" . $statusmsg;

    # The message is passed as a single argument without going through a
    # shell, so quotes, dollars or backquotes in it cannot break or alter
    # the command. $bb is split on spaces in case it carries options.
    my @cmd = ( split( " ", $bb ), $bbdisp, $message );
    system( @cmd ) == 0
        or warn "could not send the report with '$bb' (status $?)\n";
    return;
}

#### MAIN

# Get the BB and BBDISP environment settings.
$bb = $ENV{"BB"} || die "BB not defined";
$bbdisp = $ENV{"BBDISP"} || die "BBDISP not defined";

my $statusmsg = "";

# get the lsf reports
process_lsload();
process_bhosts();

# an empty bhosts output would otherwise give an all-zero green report
if ( !@bhosts ) {
    $misc .= "bhosts returned no output\n";
}

# get host status
foreach my $host ( sort keys %hash ) {
    print "$host\n" if $DEBUG;
    process_host( $host );
}

$statusmsg .= print_summary();
$statusmsg .= print_classes() if $USE_CLASSES;
$statusmsg .= print_non_full_or_ok();
#$statusmsg .= print_non_efficient();
$statusmsg .= print_misc_messages();

send_report( $statusmsg );

===== Known Bugs and Issues =====

===== To Do =====

===== Credits =====

===== Changelog =====

  * **2007-04-08**
    * Initial release