no way to compare when less than two revisions
Differences
This shows you the differences between two versions of the page.
— | monitors:lsf_mon [2009/11/23 05:51] (current) – created - external edit 127.0.0.1 | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== lsf_mon ====== | ||
+ | |||
+ | ^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] | | ||
+ | ^ Compatibility | Xymon 4.2 | | ||
+ | ^ Requirements | Perl, unix, Platform LSF | | ||
+ | ^ Download | None | | ||
+ | ^ Last Update | 2007-04-08 | | ||
+ | |||
+ | ===== Description ===== | ||
+ | |||
+ | This script allows monitoring of the hpc farm members. | ||
+ | |||
+ | It displays the number of hosts with each state in a ncv-compatible manner, and gives a list of hosts with a status of " | ||
+ | |||
+ | You can also regroup statistics per "host classes" | ||
+ | |||
+ | Here is a sample report | ||
+ | |||
+ | < | ||
+ | LSF report | ||
+ | |||
+ | Hosts by status | ||
+ | closed administratively: | ||
+ | closed busy : 0 | ||
+ | closed full : 580 | ||
+ | ok : | ||
+ | unavail | ||
+ | |||
+ | Statistics per class (%) | ||
+ | CLASS FULL OK BUSY OTHER | ||
+ | bc-1 93 0 0 7 | ||
+ | bc-10 100 0 0 0 | ||
+ | bc-2 95 0 0 5 | ||
+ | bc-3 90 0 0 10 | ||
+ | bc-4 95 0 0 5 | ||
+ | bc-5 97 0 0 3 | ||
+ | bc-6 100 0 0 0 | ||
+ | bc-7 100 0 0 0 | ||
+ | bc-8 100 0 0 0 | ||
+ | bc-9 100 0 0 0 | ||
+ | pingu 0 100 0 0 | ||
+ | turing | ||
+ | |||
+ | Closed adm hosts: | ||
+ | | ||
+ | |||
+ | Unavail hosts: | ||
+ | | ||
+ | </ | ||
+ | ===== Installation ===== | ||
+ | === Client side === | ||
+ | * Copy the script on a farm node in client' | ||
+ | * Add an entry in the clientlaunch.cfg file on the hosts running the script to run the script. Example on a linux/ | ||
+ | # lsf stats | ||
+ | [lsf] | ||
+ | | ||
+ | CMD / | ||
+ | | ||
+ | </ | ||
+ | |||
+ | === Server side === | ||
+ | * In hobbitserver.cfg: | ||
+ | - Append " | ||
+ | - Append " | ||
+ | - Add the following line for NCV:< | ||
+ | NCV_lsf=" | ||
+ | </ | ||
+ | * You need an entry in one of the bb-hosts file:< | ||
+ | 1.2.3.4 | ||
+ | </ | ||
+ | * Edit the hobbitgraph.cfg configuration to add:< | ||
+ | [lsf] | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | TITLE LSF status | ||
+ | -l 0 | ||
+ | YAXIS # | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | </ | ||
+ | ===== Source ===== | ||
+ | ==== lsf_mon.pl ==== | ||
+ | <hidden onHidden=" | ||
+ | <code perl> | ||
+ | # | ||
+ | # | ||
+ | # client-side script to monitor a lsf HPC farm | ||
+ | # | ||
+ | # copyright 2006 - 2007 Genome Research Limited / Gildas Le Nadan | ||
+ | # This script is released under the Gnu Public | ||
+ | # License (GPL) version 2 and Later | ||
+ | |||
+ | # Script version. Quoted on purpose: a bare 1.0.0 (two dots) is parsed by Perl | ||
+ | # as a v-string, i.e. the characters chr(1).chr(0).chr(0), not the text "1.0.0". | ||
+ | my $version = "1.0.0"; | ||
+ | |||
+ | use strict; | ||
+ | |||
+ | #### PARAMETERS YOU CAN TWEAK | ||
+ | |||
+ | # you can debug this script in your environment by setting $DEBUG to 1 below and running | ||
+ | # BB=echo BBDISP=127.0.0.1 ./ | ||
+ | my $DEBUG = 0; # 1 for debug, 0 otherwise | ||
+ | |||
+ | # how the lsf commands must be run | ||
+ | my @lsload=`. / | ||
+ | my @bhosts=`. / | ||
+ | |||
+ | # lines to ignore in the outputs | ||
+ | my $bhosts_banner=" | ||
+ | my $lsload_banner=" | ||
+ | |||
+ | # classes are a way to regroup the results for certain hosts that are part of a | ||
+ | # group/ | ||
+ | # must be related to the class they are part of | ||
+ | my $USE_CLASSES = 1; # 1 if we use the concept of server classes, 0 otherwise | ||
+ | |||
+ | # how to split our hostnames into classes | ||
+ | my $class_regex = " | ||
+ | |||
+ | # hobbit config | ||
+ | my $farm = " | ||
+ | my $hobbitcolumn = " | ||
+ | my $color = " | ||
+ | my $summary = "LSF report"; | ||
+ | |||
+ | #### INTERNAL PARAMETERS (NO TWEAKING REQUIRED) | ||
+ | |||
+ | my $bb = ""; | ||
+ | my $bbdisp = ""; | ||
+ | |||
+ | my %hash = (); | ||
+ | my %class = (); | ||
+ | |||
+ | my $admclosed_hosts = ""; | ||
+ | my $busy_hosts = ""; | ||
+ | my $unavail_hosts = ""; | ||
+ | my $unreach_hosts = ""; | ||
+ | my $non_efficient = ""; | ||
+ | my $misc = ""; | ||
+ | |||
+ | my $closed_full_counter = 0; | ||
+ | my $closed_busy_counter = 0; | ||
+ | my $closed_adm_counter = 0; | ||
+ | my $ok_counter = 0; | ||
+ | my $unavail_counter = 0; | ||
+ | my $unreach_counter = 0; | ||
+ | |||
+ | my $CL_ADM = " | ||
+ | my $CL_BUSY = " | ||
+ | my $CL_FULL = " | ||
+ | my $OK = " | ||
+ | my $UNREACHABLE = " | ||
+ | my $UNAVAILABLE = " | ||
+ | |||
+ | ### FUNCTIONS | ||
+ | |||
+ | sub add_to_class { | ||
+ | my ( $status, $host ) = @_; | ||
+ | if ( $host =~ qr/ | ||
+ | my $my_class = $1; | ||
+ | if ( ! defined $class{$my_class}{$status} ) { | ||
+ | $class{$my_class}{$status} = 0; | ||
+ | } | ||
+ | $class{$my_class}{$status}++; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | # Register one host under the $CL_ADM status: append its name to | ||
+ | # $admclosed_hosts, bump $closed_adm_counter, and, when $USE_CLASSES is set, | ||
+ | # forward it to add_to_class for the per-class statistics. | ||
+ | sub add_to_closed_adm { | ||
+ | my $host = shift; | ||
+ | $admclosed_hosts .= " $host"; | ||
+ | $closed_adm_counter += 1; | ||
+ | add_to_class( $CL_ADM, $host ) if $USE_CLASSES; | ||
+ | } | ||
+ | |||
+ | # Register one host under the $CL_BUSY status: append its name to $busy_hosts, | ||
+ | # bump $closed_busy_counter, and, when $USE_CLASSES is set, forward it to | ||
+ | # add_to_class for the per-class statistics. | ||
+ | sub add_to_busy { | ||
+ | my $host = shift; | ||
+ | $busy_hosts .= " $host"; | ||
+ | $closed_busy_counter += 1; | ||
+ | add_to_class( $CL_BUSY, $host ) if $USE_CLASSES; | ||
+ | } | ||
+ | |||
+ | # Register one host under the $OK status: bump $ok_counter and, when | ||
+ | # $USE_CLASSES is set, forward it to add_to_class. No host-name list is kept | ||
+ | # for this status. | ||
+ | sub add_to_ok { | ||
+ | my $host = shift; | ||
+ | $ok_counter += 1; | ||
+ | add_to_class( $OK, $host ) if $USE_CLASSES; | ||
+ | } | ||
+ | |||
+ | # Register one host under the $CL_FULL status: bump $closed_full_counter and, | ||
+ | # when $USE_CLASSES is set, forward it to add_to_class. No host-name list is | ||
+ | # kept for this status. | ||
+ | sub add_to_full { | ||
+ | my $host = shift; | ||
+ | $closed_full_counter += 1; | ||
+ | add_to_class( $CL_FULL, $host ) if $USE_CLASSES; | ||
+ | } | ||
+ | |||
+ | sub add_to_unavail { | ||
+ | my ( $host ) = @_; | ||
+ | $unavail_counter++; | ||
+ | $unavail_hosts .= " $host"; | ||
+ | add_to_class( $UNAVAILABLE, | ||
+ | } | ||
+ | |||
+ | sub add_to_unreach { | ||
+ | my ( $host ) = @_; | ||
+ | $unreach_counter++; | ||
+ | $unreach_hosts .= " $host"; | ||
+ | add_to_class( $UNREACHABLE, | ||
+ | } | ||
+ | |||
+ | sub print_non_full_or_ok { | ||
+ | my $result = ""; | ||
+ | if ( $admclosed_hosts ne "" | ||
+ | $result .= " | ||
+ | } | ||
+ | if ( $busy_hosts ne "" | ||
+ | $result .= " | ||
+ | } | ||
+ | if ( $unavail_hosts ne "" | ||
+ | $result .= " | ||
+ | } | ||
+ | return $result; | ||
+ | } | ||
+ | |||
+ | sub percent { | ||
+ | my ( $value, $total ) = @_; | ||
+ | my $percent = ( $value / $total ) * 100; | ||
+ | return sprintf( " | ||
+ | } | ||
+ | |||
+ | sub print_classes { | ||
+ | my $result = ""; | ||
+ | $result .= " | ||
+ | foreach my $my_class ( sort keys %class ) { | ||
+ | # fully populate the variables we need | ||
+ | my @variables = ( $OK, $CL_FULL, $CL_BUSY, $CL_ADM, $UNAVAILABLE, | ||
+ | foreach my $v ( @variables ) { | ||
+ | unless ( defined( $class{$my_class}{$v} ) ) { | ||
+ | $class{$my_class}{$v} = 0; | ||
+ | } | ||
+ | } | ||
+ | my $others = $class{$my_class}{$CL_ADM} | ||
+ | + $class{$my_class}{$UNAVAILABLE} | ||
+ | + $class{$my_class}{$UNREACHABLE}; | ||
+ | |||
+ | my $total = $others + $class{$my_class}{$OK} | ||
+ | + $class{$my_class}{$CL_FULL} | ||
+ | + $class{$my_class}{$CL_BUSY}; | ||
+ | |||
+ | $result .= $my_class." | ||
+ | $result .= percent( $class{$my_class}{$CL_FULL}, | ||
+ | $result .= " | ||
+ | $result .= percent( $class{$my_class}{$OK}, | ||
+ | $result .= " | ||
+ | $result .= percent( $class{$my_class}{$CL_BUSY}, | ||
+ | $result .= " | ||
+ | $result .= percent( $others, $total ); | ||
+ | $result .= " | ||
+ | } | ||
+ | return $result; | ||
+ | } | ||
+ | |||
+ | sub print_summary { | ||
+ | my $result = "Hosts by status\n"; | ||
+ | $result .= sprintf( " | ||
+ | $result .= sprintf( " | ||
+ | $result .= sprintf( " | ||
+ | $result .= sprintf( " | ||
+ | $result .= sprintf( " | ||
+ | return $result; | ||
+ | } | ||
+ | |||
+ | # a non-efficient job is a job where there is a non-CPU constraint, i.e. one that | ||
+ | # sits on the farm doing nothing or just about | ||
+ | # The way I used to measure it was not fine-grained enough so I removed this | ||
+ | # feature for the time being | ||
+ | # | ||
+ | #sub print_non_efficient { | ||
+ | # my $result = ""; | ||
+ | # if ( $non_efficient ne "" | ||
+ | # $result .= " | ||
+ | # } | ||
+ | # return $result; | ||
+ | #} | ||
+ | |||
+ | sub print_misc_messages { | ||
+ | my $result = ""; | ||
+ | if ( $misc ne "" | ||
+ | $result .= " | ||
+ | } | ||
+ | return $result; | ||
+ | } | ||
+ | |||
+ | sub process_lsload { | ||
+ | foreach my $line ( @lsload ) { | ||
+ | unless ( $line =~ / | ||
+ | my @fields = split( " ", $line ); | ||
+ | $hash{$fields[0]}{cpu_usage} = $fields[5]; | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | |||
+ | sub process_bhosts { | ||
+ | foreach my $line ( @bhosts ) { | ||
+ | unless ( $line =~ / | ||
+ | my @fields = split( " ", $line ); | ||
+ | $hash{$fields[0]}{status} = $fields[1]; | ||
+ | $hash{$fields[0]}{max_jobs} = $fields[3]; | ||
+ | $hash{$fields[0]}{nb_jobs} = $fields[4]; | ||
+ | $hash{$fields[0]}{run_jobs} = $fields[5]; | ||
+ | $hash{$fields[0]}{sys_susp} = $fields[6]; | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | |||
+ | sub process_host { | ||
+ | my ( $host ) = @_; | ||
+ | |||
+ | # add the host to a status class | ||
+ | if ( $hash{$host}{status} =~ qr/$CL_ADM/ ) { | ||
+ | add_to_closed_adm( $host ); | ||
+ | } | ||
+ | elsif ( $hash{$host}{status} =~ qr/ | ||
+ | add_to_busy( $host ); | ||
+ | } | ||
+ | elsif ( $hash{$host}{status} =~ qr/ | ||
+ | add_to_full( $host ); | ||
+ | } | ||
+ | elsif ( $hash{$host}{status} =~ qr/$OK/ ) { | ||
+ | add_to_ok( $host ); | ||
+ | } | ||
+ | elsif ( $hash{$host}{status} =~ qr/ | ||
+ | add_to_unavail( $host ); | ||
+ | } | ||
+ | elsif ( $hash{$host}{status} =~ qr/ | ||
+ | add_to_unreach( $host ); | ||
+ | } | ||
+ | } | ||
+ | |||
+ | sub send_report { | ||
+ | my ( $statusmsg ) = @_; | ||
+ | if ( ( $non_efficient ne "" | ||
+ | $color = " | ||
+ | } | ||
+ | # Build the command we use to send a status to the Xymon daemon | ||
+ | my $cmd = $bb . " " . $bbdisp . " \" | ||
+ | # And send the message | ||
+ | system $cmd; | ||
+ | } | ||
+ | |||
+ | #### MAIN | ||
+ | |||
+ | # Get the BB and BBDISP environment settings. | ||
+ | $bb = $ENV{" | ||
+ | $bbdisp = $ENV{" | ||
+ | |||
+ | my $statusmsg = ""; | ||
+ | |||
+ | # get the lsf reports | ||
+ | process_lsload; | ||
+ | process_bhosts; | ||
+ | |||
+ | # get host status | ||
+ | foreach my $host ( sort keys %hash ) { | ||
+ | print " | ||
+ | process_host( $host ); | ||
+ | } | ||
+ | |||
+ | $statusmsg .= print_summary; | ||
+ | $statusmsg .= print_classes if $USE_CLASSES; | ||
+ | $statusmsg .= print_non_full_or_ok; | ||
+ | #$statusmsg .= print_non_efficient; | ||
+ | $statusmsg .= print_misc_messages; | ||
+ | send_report( $statusmsg ); | ||
+ | </ | ||
+ | </ | ||
+ | |||
+ | ===== Known Bugs and Issues ===== | ||
+ | |||
+ | ===== To Do ===== | ||
+ | |||
+ | ===== Credits ===== | ||
+ | |||
+ | ===== Changelog ===== | ||
+ | |||
+ | * **2007-04-08** | ||
+ | * Initial release | ||