monitors:lsf_mon

no way to compare when less than two revisions

Differences

This shows you the differences between two versions of the page.


monitors:lsf_mon [2009/11/23 05:51] (current) – created - external edit 127.0.0.1
Line 1: Line 1:
 +====== lsf_mon ======
 +
 +^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] |
 +^ Compatibility | Xymon 4.2 |
 +^ Requirements | Perl, unix, Platform LSF |
 +^ Download | None |
 +^ Last Update | 2007-04-08 |
 +
 +===== Description =====
 +
 +This script allows monitoring of the hpc farm members.
 +
 +It displays the number of hosts in each state in an NCV-compatible manner, and gives a list of the hosts whose status is "closed administratively" or "unavailable".
 +
 +You can also group the statistics by "host class" using a regular expression (see the script source for more details).
 +
 +Here is a sample report
 +
 +<code>
 +LSF report
 +
 +Hosts by status
 +closed administratively:    4
 +closed busy            :    0
 +closed full            :  580
 +ok                     :    4
 +unavail                :    5
 +
 +Statistics per class (%)
 +CLASS    FULL    OK    BUSY    OTHER
 +bc-1    93    0    0    7
 +bc-10    100    0    0    0
 +bc-2    95    0    0    5
 +bc-3    90    0    0    10
 +bc-4    95    0    0    5
 +bc-5    97    0    0    3
 +bc-6    100    0    0    0
 +bc-7    100    0    0    0
 +bc-8    100    0    0    0
 +bc-9    100    0    0    0
 +pingu    0    100    0    0
 +turing    0    100    0    0
 +
 +Closed adm hosts:
 + bc-1-2-12 bc-2-2-01 bc-5-2-01 bc-5-2-02
 +
 +Unavail hosts:
 + bc-1-3-03 bc-2-1-05 bc-3-3-08 bc-3-3-12 bc-4-3-08 
 +</code>
 +===== Installation =====
 +=== Client side ===
 +  * Copy the script to a farm node, into the client's ext folder (e.g. /usr/lib/hobbit/client/ext on linux/debian), with owner hobbit.hobbit and permissions 0755
 +  * Add an entry to the clientlaunch.cfg file on the host that runs the script, so that the client launches it. Example on a linux/debian host:<code>
 +# lsf stats
 +[lsf]
 +       ENVFILE /usr/lib/hobbit/client/etc/hobbitclient.cfg
 +       CMD /usr/lib/hobbit/client/ext/lsf_mon.pl
 +       INTERVAL 5m
 +</code>
 +
 +=== Server side ===
 +  * In hobbitserver.cfg:
 +    - Append "lsf=ncv" to TEST2RRD
 +    - Append "lsf" to GRAPHS
 +    - Add the following line for NCV:<code>
 +NCV_lsf="*:NONE,closedadministrativ:GAUGE,closedbusy:GAUGE,closedfull:GAUGE,ok:GAUGE,unavail:GAUGE"
 +</code>
 +  * You need an entry in one of the bb-hosts files:<code>
 +1.2.3.4     myfarm      # noconn TRENDS:*,lsf
 +</code>
 +  * Edit the hobbitgraph.cfg configuration to add:<code>
 +[lsf]
 +       DEF:admin=lsf.rrd:closedadministrativ:AVERAGE
 +       DEF:full=lsf.rrd:closedfull:AVERAGE
 +       DEF:busy=lsf.rrd:closedbusy:AVERAGE
 +       DEF:ok=lsf.rrd:ok:AVERAGE
 +       DEF:unavail=lsf.rrd:unavail:AVERAGE
 +       TITLE LSF status
 +       -l 0
 +       YAXIS #
 +       AREA:unavail#bbbbbb:unavailable
 +       GPRINT:unavail:LAST: \: %5.1lf (cur)
 +       GPRINT:unavail:MAX: \: %5.1lf (max)
 +       GPRINT:unavail:MIN: \: %5.1lf (min)
 +       GPRINT:unavail:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:busy#ff0000:closed busy
 +       GPRINT:busy:LAST: \: %5.1lf (cur)
 +       GPRINT:busy:MAX: \: %5.1lf (max)
 +       GPRINT:busy:MIN: \: %5.1lf (min)
 +       GPRINT:busy:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:admin#5555cc:closed admin.
 +       GPRINT:admin:LAST: \: %5.1lf (cur)
 +       GPRINT:admin:MAX: \: %5.1lf (max)
 +       GPRINT:admin:MIN: \: %5.1lf (min)
 +       GPRINT:admin:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:ok#ff9900:ok
 +       GPRINT:ok:LAST: \: %5.1lf (cur)
 +       GPRINT:ok:MAX: \: %5.1lf (max)
 +       GPRINT:ok:MIN: \: %5.1lf (min)
 +       GPRINT:ok:AVERAGE: \: %5.1lf (avg)\n
 +       STACK:full#33cc33:closed full
 +       GPRINT:full:LAST: \: %5.1lf (cur)
 +       GPRINT:full:MAX: \: %5.1lf (max)
 +       GPRINT:full:MIN: \: %5.1lf (min)
 +       GPRINT:full:AVERAGE: \: %5.1lf (avg)\n 
 +</code>
 +===== Source =====
 +==== lsf_mon.pl ====
 +<hidden onHidden="Show Code ⇲" onVisible="Hide Code ⇱">
 +<code perl>
 +#!/usr/bin/perl -w
 +#
 +# client-side script to monitor a lsf HPC farm
 +#
 +# copyright 2006 - 2007  Genome Research Limited / Gildas Le Nadan
 +# This script is released under the Gnu Public
 +# License (GPL) version 2 and Later
 +
 +my $version = 1.0.0;
 +
 +use strict;
 +
 +#### PARAMETERS YOU CAN TWEAK
 +
 +# you can debug this script in your environment by setting 1 below and running
 +# BB=echo BBDISP=127.0.0.1 ./lsf_mon.pl on one of your nodes
 +my $DEBUG = 0; # 1 for debug, 0 otherwise
 +
 +# how the lsf commands must be run
 +my @lsload=`. /software/noarch/lsf/conf/profile.lsf &&  /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/lsload`;
 +my @bhosts=`. /software/noarch/lsf/conf/profile.lsf &&  /usr/local/lsf/6.1/linux2.6-glibc2.3-amd64/bin/bhosts -w`;
 +
 +# lines to ignore in the outputs
 +my $bhosts_banner="HOST_NAME          STATUS          JL/U    MAX  NJOBS    RUN  SSUSP  USUSP    RSV";
 +my $lsload_banner="HOST_NAME       status  r15s   r1m  r15m   ut    pg  ls    it   tmp   swp   mem";
 +
 +# classes are a way to regroup the results for certain hosts that a part of a
 +# group/class. It uses a regexp to separate the classes, so your hostnames
 +# must be related to the class they are part of
 +my $USE_CLASSES = 1; # 1 if we use the concept of server classes, 0 otherwise
 +
 +# how to split our hostnames into a classes
 +my $class_regex = "^(pingu|turing|bc-[0-9]*)-*[0-9]*-*[0-9]*";
 +
 +# hobbit config
 +my $farm = "farm-login"; # the hostname you want to report under
 +my $hobbitcolumn = "lsf"; # the column name for the test
 +my $color = "green"; # default color
 +my $summary = "LSF report"; # title of the test output
 +
 +#### INTERNAL PARAMETERS (NO TWEAKING REQUIRED)
 +
 +my $bb = "";
 +my $bbdisp = "";
 +
 +my %hash = ();
 +my %class = ();
 +
 +my $admclosed_hosts = "";
 +my $busy_hosts = "";
 +my $unavail_hosts = "";
 +my $unreach_hosts = "";
 +my $non_efficient = "";
 +my $misc = "";
 +
 +my $closed_full_counter = 0;
 +my $closed_busy_counter = 0;
 +my $closed_adm_counter = 0;
 +my $ok_counter = 0;
 +my $unavail_counter = 0;
 +my $unreach_counter = 0;
 +
 +my $CL_ADM = "closed_Adm";
 +my $CL_BUSY = "closed_Busy";
 +my $CL_FULL = "closed_Full";
 +my $OK = "ok";
 +my $UNREACHABLE = "unreach";
 +my $UNAVAILABLE = "unavail";
 +
 +### FUNCTIONS
 +
 +sub add_to_class {
 + my ( $status, $host ) = @_;
 + if ( $host =~ qr/$class_regex/ ) {
 + my $my_class = $1;
 + if ( ! defined $class{$my_class}{$status} ) {
 + $class{$my_class}{$status} = 0;
 + }
 + $class{$my_class}{$status}++;
 + }
 +}
 +
 +sub add_to_closed_adm {
 + my ( $host ) = @_;
 + $closed_adm_counter++;
 + $admclosed_hosts .=  " $host";
 + add_to_class( $CL_ADM, $host) if $USE_CLASSES;
 +}
 +
 +sub add_to_busy {
 + my ( $host ) = @_;
 + $closed_busy_counter++;
 + $busy_hosts .= " $host";
 + add_to_class( $CL_BUSY, $host) if $USE_CLASSES;
 +}
 +
 +sub add_to_ok {
 + my ( $host ) = @_;
 + $ok_counter++;
 + add_to_class( $OK, $host) if $USE_CLASSES;
 +}
 +
 +sub add_to_full {
 + my ( $host ) = @_;
 + $closed_full_counter++;
 + add_to_class( $CL_FULL, $host) if $USE_CLASSES;
 +}
 +
 +sub add_to_unavail {
 + my ( $host ) = @_;
 + $unavail_counter++;
 + $unavail_hosts .= " $host";
 + add_to_class( $UNAVAILABLE, $host) if $USE_CLASSES;
 +}
 +
 +sub add_to_unreach {
 + my ( $host ) = @_;
 + $unreach_counter++;
 + $unreach_hosts .= " $host";
 + add_to_class( $UNREACHABLE, $host) if $USE_CLASSES;
 +}
 +
 +sub print_non_full_or_ok {
 + my $result = "";
 + if ( $admclosed_hosts ne "" ) {
 + $result .= "\nClosed adm hosts:\n".$admclosed_hosts."\n";
 + }
 + if ( $busy_hosts ne "" ) {
 + $result .= "\nBusy hosts:\n".$busy_hosts."\n";
 + }
 + if ( $unavail_hosts ne "" ) {
 + $result .= "\nUnavail hosts:\n".$unavail_hosts."\n";
 + }
 + return $result;
 +}
 +
 +sub percent {
 + my ( $value, $total ) = @_;
 + my $percent =  ( $value / $total ) * 100;
 + return sprintf( "%.0f", $percent);
 +
 +
 +sub print_classes {
 + my $result = "";
 + $result .= "\nStatistics per class (%)\nCLASS\tFULL\tOK\tBUSY\tOTHER\n";
 + foreach my $my_class ( sort keys %class ) {
 + # fully populate the variables we need
 + my @variables = ( $OK, $CL_FULL, $CL_BUSY, $CL_ADM, $UNAVAILABLE, $UNREACHABLE );
 + foreach my $v ( @variables ) {
 + unless ( defined( $class{$my_class}{$v} ) ) {
 + $class{$my_class}{$v} = 0;
 + }
 + }
 + my $others = $class{$my_class}{$CL_ADM}
 + + $class{$my_class}{$UNAVAILABLE}
 + + $class{$my_class}{$UNREACHABLE};
 +
 + my $total = $others + $class{$my_class}{$OK}
 + + $class{$my_class}{$CL_FULL}
 + + $class{$my_class}{$CL_BUSY};
 +
 + $result .= $my_class."\t";
 + $result .= percent( $class{$my_class}{$CL_FULL}, $total );
 + $result .= "\t";
 + $result .= percent( $class{$my_class}{$OK}, $total );
 + $result .= "\t";
 + $result .= percent( $class{$my_class}{$CL_BUSY}, $total );
 + $result .= "\t";
 + $result .= percent( $others, $total );
 + $result .= "\n";
 + }
 + return $result;
 +}
 +
 +sub print_summary {
 + my $result = "Hosts by status\n";
 + $result .= sprintf( "closed administratively: %4u\n", $closed_adm_counter);
 + $result .= sprintf( "closed busy            : %4u\n", $closed_busy_counter );
 + $result .= sprintf( "closed full            : %4u\n", $closed_full_counter );
 + $result .= sprintf( "ok                     : %4u\n", $ok_counter );
 + $result .= sprintf( "unavail                : %4u\n", $unavail_counter );
 + return $result;
 +}
 +
 +# A non-efficient job is a job with a non-CPU constraint, i.e. one that
 +# sits on the farm doing nothing, or just about.
 +# The way I used to measure it was not fine-grained enough, so I removed this
 +# feature for the time being.
 +#
 +#sub print_non_efficient {
 +# my $result = "";
 +# if ( $non_efficient ne "" ) {
 +# $result .= "\n&YELLOW Hosts running non efficient tasks:\n".$non_efficient."\n";
 +# }
 +# return $result;
 +#}
 +
 +sub print_misc_messages {
 + my $result = "";
 + if ( $misc ne "" ) {
 + $result .= "\nMisc:\n".$misc."\n";
 + }
 + return $result;
 +}
 +
 +sub process_lsload {
 + foreach my $line ( @lsload ) {
 + unless ( $line =~ /$lsload_banner/ ) {
 + my @fields = split( " ", $line );
 + $hash{$fields[0]}{cpu_usage} = $fields[5];
 + }
 + }
 +}
 +
 +sub process_bhosts {
 + foreach my $line ( @bhosts ) {
 + unless ( $line =~ /$bhosts_banner/ ) {
 + my @fields = split( " ", $line );
 + $hash{$fields[0]}{status} = $fields[1];
 + $hash{$fields[0]}{max_jobs} = $fields[3];
 + $hash{$fields[0]}{nb_jobs} = $fields[4];
 + $hash{$fields[0]}{run_jobs} = $fields[5];
 + $hash{$fields[0]}{sys_susp} = $fields[6];
 + }
 + }
 +}
 +
 +sub process_host {
 + my ( $host ) = @_;
 +
 + # add the host to a status class
 + if ( $hash{$host}{status} =~ qr/$CL_ADM/ ) {
 + add_to_closed_adm( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/$CL_BUSY/ ) {
 + add_to_busy( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/$CL_FULL/ ) {
 + add_to_full( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/$OK/ ) {
 + add_to_ok( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/$UNAVAILABLE/ ) {
 + add_to_unavail( $host );
 + }
 + elsif ( $hash{$host}{status} =~ qr/$UNREACHABLE/ ) {
 + add_to_unreach( $host );
 + }
 +}
 +
 +sub send_report {
 + my ( $statusmsg ) = @_;
 + if ( ( $non_efficient ne "" ) || ( $misc ne "" ) ) {
 + $color = "yellow";
 + }
 + # Build the command we use to send a status to the Xymon daemon
 + my $cmd = $bb . " " . $bbdisp . " \"status " . $farm . "." . $hobbitcolumn . " " . $color . " " . $summary . "\n\n" . $statusmsg . "\"";
 + # And send the message
 + system $cmd;
 +}
 +
 +#### MAIN
 +
 +# Get the BB and BBDISP environment settings.
 +$bb = $ENV{"BB"} || die "BB not defined";
 +$bbdisp = $ENV{"BBDISP"} || die "BBDISP not defined";
 +
 +my $statusmsg = "";
 +
 +# get the lsf reports
 +process_lsload;
 +process_bhosts;
 +
 +# get host status
 +foreach my $host ( sort keys %hash ) {
 + print "$host\n" if $DEBUG;
 + process_host( $host );
 +}
 +
 +$statusmsg .= print_summary;
 +$statusmsg .= print_classes if $USE_CLASSES;
 +$statusmsg .= print_non_full_or_ok;
 +#$statusmsg .= print_non_efficient;
 +$statusmsg .= print_misc_messages;
 +send_report( $statusmsg );
 +</code>
 +</hidden>
 +
 +===== Known  Bugs and Issues =====
 +
 +===== To Do =====
 +
 +===== Credits =====
 +
 +===== Changelog =====
 +
 +  * **2007-04-08**
 +    * Initial release
  
  • monitors/lsf_mon.txt
  • Last modified: 2009/11/23 05:51
  • by 127.0.0.1