no way to compare when less than two revisions
Differences
This shows you the differences between two versions of the page.
| — | monitors:lsf_mon [2009/11/23 05:51] (current) – created - external edit 127.0.0.1 | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| + | ====== lsf_mon ====== | ||
| + | |||
| + | ^ Author | [[ gn1@sanger.ac.uk | Gildas Le Nadan ]] | | ||
| + | ^ Compatibility | Xymon 4.2 | | ||
| + | ^ Requirements | Perl, unix, Platform LSF | | ||
| + | ^ Download | None | | ||
| + | ^ Last Update | 2007-04-08 | | ||
| + | |||
| + | ===== Description ===== | ||
| + | |||
| + | This script allows monitoring of the members of an HPC farm. | ||
| + | |||
| + | It displays the number of hosts with each state in a ncv-compatible manner, and gives a list of hosts with a status of " | ||
| + | |||
| + | You can also regroup statistics per "host classes" | ||
| + | |||
| + | Here is a sample report | ||
| + | |||
| + | < | ||
| + | LSF report | ||
| + | |||
| + | Hosts by status | ||
| + | closed administratively: | ||
| + | closed busy : 0 | ||
| + | closed full : 580 | ||
| + | ok : | ||
| + | unavail | ||
| + | |||
| + | Statistics per class (%) | ||
| + | CLASS FULL OK BUSY OTHER | ||
| + | bc-1 93 0 0 7 | ||
| + | bc-10 100 0 0 0 | ||
| + | bc-2 95 0 0 5 | ||
| + | bc-3 90 0 0 10 | ||
| + | bc-4 95 0 0 5 | ||
| + | bc-5 97 0 0 3 | ||
| + | bc-6 100 0 0 0 | ||
| + | bc-7 100 0 0 0 | ||
| + | bc-8 100 0 0 0 | ||
| + | bc-9 100 0 0 0 | ||
| + | pingu 0 100 0 0 | ||
| + | turing | ||
| + | |||
| + | Closed adm hosts: | ||
| + | | ||
| + | |||
| + | Unavail hosts: | ||
| + | | ||
| + | </ | ||
| + | ===== Installation ===== | ||
| + | === Client side === | ||
| + | * Copy the script on a farm node in client' | ||
| + | * Add an entry in the clientlaunch.cfg file on the hosts running the script to run the script. Example on a linux/ | ||
| + | # lsf stats | ||
| + | [lsf] | ||
| + | | ||
| + | CMD / | ||
| + | | ||
| + | </ | ||
| + | |||
| + | === Server side === | ||
| + | * In hobbitserver.cfg: | ||
| + | - Append " | ||
| + | - Append " | ||
| + | - Add the following line for NCV:< | ||
| + | NCV_lsf=" | ||
| + | </ | ||
| + | * You need an entry in one of the bb-hosts file:< | ||
| + | 1.2.3.4 | ||
| + | </ | ||
| + | * Edit the hobbitgraph.cfg configuration to add:< | ||
| + | [lsf] | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | TITLE LSF status | ||
| + | -l 0 | ||
| + | YAXIS # | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | </ | ||
| + | ===== Source ===== | ||
| + | ==== lsf_mon.pl ==== | ||
| + | <hidden onHidden=" | ||
| + | <code perl> | ||
| + | # | ||
| + | # | ||
| + | # client-side script to monitor a lsf HPC farm | ||
| + | # | ||
| + | # copyright 2006 - 2007 Genome Research Limited / Gildas Le Nadan | ||
| + | # This script is released under the Gnu Public | ||
| + | # License (GPL) version 2 and Later | ||
| + | |||
| + | # Script version, kept as a quoted string: an unquoted literal with two dots | ||
| + | # (1.0.0) is parsed by Perl as a v-string, i.e. chr(1).chr(0).chr(0), not "1.0.0". | ||
| + | my $version = "1.0.0"; | ||
| + | |||
| + | use strict; | ||
| + | |||
| + | #### PARAMETERS YOU CAN TWEAK | ||
| + | |||
| + | # you can debug this script in your environment by setting 1 below and running | ||
| + | # BB=echo BBDISP=127.0.0.1 ./ | ||
| + | my $DEBUG = 0; # 1 for debug, 0 otherwise | ||
| + | |||
| + | # how the lsf commands must be run | ||
| + | my @lsload=`. / | ||
| + | my @bhosts=`. / | ||
| + | |||
| + | # lines to ignore in the outputs | ||
| + | my $bhosts_banner=" | ||
| + | my $lsload_banner=" | ||
| + | |||
| + | # classes are a way to regroup the results for certain hosts that are part of a | ||
| + | # group/ | ||
| + | # must be related to the class they are part of | ||
| + | my $USE_CLASSES = 1; # 1 if we use the concept of server classes, 0 otherwise | ||
| + | |||
| + | # how to split our hostnames into classes | ||
| + | my $class_regex = " | ||
| + | |||
| + | # hobbit config | ||
| + | my $farm = " | ||
| + | my $hobbitcolumn = " | ||
| + | my $color = " | ||
| + | my $summary = "LSF report"; | ||
| + | |||
| + | #### INTERNAL PARAMETERS (NO TWEAKING REQUIRED) | ||
| + | |||
| + | my $bb = ""; | ||
| + | my $bbdisp = ""; | ||
| + | |||
| + | my %hash = (); | ||
| + | my %class = (); | ||
| + | |||
| + | my $admclosed_hosts = ""; | ||
| + | my $busy_hosts = ""; | ||
| + | my $unavail_hosts = ""; | ||
| + | my $unreach_hosts = ""; | ||
| + | my $non_efficient = ""; | ||
| + | my $misc = ""; | ||
| + | |||
| + | my $closed_full_counter = 0; | ||
| + | my $closed_busy_counter = 0; | ||
| + | my $closed_adm_counter = 0; | ||
| + | my $ok_counter = 0; | ||
| + | my $unavail_counter = 0; | ||
| + | my $unreach_counter = 0; | ||
| + | |||
| + | my $CL_ADM = " | ||
| + | my $CL_BUSY = " | ||
| + | my $CL_FULL = " | ||
| + | my $OK = " | ||
| + | my $UNREACHABLE = " | ||
| + | my $UNAVAILABLE = " | ||
| + | |||
| + | ### FUNCTIONS | ||
| + | |||
| + | sub add_to_class { | ||
| + | my ( $status, $host ) = @_; | ||
| + | if ( $host =~ qr/ | ||
| + | my $my_class = $1; | ||
| + | if ( ! defined $class{$my_class}{$status} ) { | ||
| + | $class{$my_class}{$status} = 0; | ||
| + | } | ||
| + | $class{$my_class}{$status}++; | ||
| + | } | ||
| + | } | ||
| + | |||
| + | # Record one host in the "closed administratively" bucket: bump its counter, | ||
| + | # append the host name to the list reported later, and (when classes are | ||
| + | # enabled) register it under the $CL_ADM status via add_to_class. | ||
| + | sub add_to_closed_adm { | ||
| + | my $node = shift; | ||
| + | $closed_adm_counter += 1; | ||
| + | $admclosed_hosts = $admclosed_hosts . " $node"; | ||
| + | if ( $USE_CLASSES ) { | ||
| + | add_to_class( $CL_ADM, $node ); | ||
| + | } | ||
| + | } | ||
| + | |||
| + | # Record one host in the "closed busy" bucket: bump its counter, append the | ||
| + | # host name to the busy list, and (when classes are enabled) register it | ||
| + | # under the $CL_BUSY status via add_to_class. | ||
| + | sub add_to_busy { | ||
| + | my $node = shift; | ||
| + | $closed_busy_counter += 1; | ||
| + | $busy_hosts = $busy_hosts . " $node"; | ||
| + | if ( $USE_CLASSES ) { | ||
| + | add_to_class( $CL_BUSY, $node ); | ||
| + | } | ||
| + | } | ||
| + | |||
| + | # Count one host under the $OK status; no host list is kept for this state. | ||
| + | # When classes are enabled the host is also registered via add_to_class. | ||
| + | sub add_to_ok { | ||
| + | my $node = shift; | ||
| + | $ok_counter += 1; | ||
| + | if ( $USE_CLASSES ) { | ||
| + | add_to_class( $OK, $node ); | ||
| + | } | ||
| + | } | ||
| + | |||
| + | # Count one host under the $CL_FULL status; no host list is kept for this | ||
| + | # state. When classes are enabled the host is also registered via add_to_class. | ||
| + | sub add_to_full { | ||
| + | my $node = shift; | ||
| + | $closed_full_counter += 1; | ||
| + | if ( $USE_CLASSES ) { | ||
| + | add_to_class( $CL_FULL, $node ); | ||
| + | } | ||
| + | } | ||
| + | |||
| + | sub add_to_unavail { | ||
| + | my ( $host ) = @_; | ||
| + | $unavail_counter++; | ||
| + | $unavail_hosts .= " $host"; | ||
| + | add_to_class( $UNAVAILABLE, | ||
| + | } | ||
| + | |||
| + | sub add_to_unreach { | ||
| + | my ( $host ) = @_; | ||
| + | $unreach_counter++; | ||
| + | $unreach_hosts .= " $host"; | ||
| + | add_to_class( $UNREACHABLE, | ||
| + | } | ||
| + | |||
| + | sub print_non_full_or_ok { | ||
| + | my $result = ""; | ||
| + | if ( $admclosed_hosts ne "" | ||
| + | $result .= " | ||
| + | } | ||
| + | if ( $busy_hosts ne "" | ||
| + | $result .= " | ||
| + | } | ||
| + | if ( $unavail_hosts ne "" | ||
| + | $result .= " | ||
| + | } | ||
| + | return $result; | ||
| + | } | ||
| + | |||
| + | sub percent { | ||
| + | my ( $value, $total ) = @_; | ||
| + | my $percent = ( $value / $total ) * 100; | ||
| + | return sprintf( " | ||
| + | } | ||
| + | |||
| + | sub print_classes { | ||
| + | my $result = ""; | ||
| + | $result .= " | ||
| + | foreach my $my_class ( sort keys %class ) { | ||
| + | # fully populate the variables we need | ||
| + | my @variables = ( $OK, $CL_FULL, $CL_BUSY, $CL_ADM, $UNAVAILABLE, | ||
| + | foreach my $v ( @variables ) { | ||
| + | unless ( defined( $class{$my_class}{$v} ) ) { | ||
| + | $class{$my_class}{$v} = 0; | ||
| + | } | ||
| + | } | ||
| + | my $others = $class{$my_class}{$CL_ADM} | ||
| + | + $class{$my_class}{$UNAVAILABLE} | ||
| + | + $class{$my_class}{$UNREACHABLE}; | ||
| + | |||
| + | my $total = $others + $class{$my_class}{$OK} | ||
| + | + $class{$my_class}{$CL_FULL} | ||
| + | + $class{$my_class}{$CL_BUSY}; | ||
| + | |||
| + | $result .= $my_class." | ||
| + | $result .= percent( $class{$my_class}{$CL_FULL}, | ||
| + | $result .= " | ||
| + | $result .= percent( $class{$my_class}{$OK}, | ||
| + | $result .= " | ||
| + | $result .= percent( $class{$my_class}{$CL_BUSY}, | ||
| + | $result .= " | ||
| + | $result .= percent( $others, $total ); | ||
| + | $result .= " | ||
| + | } | ||
| + | return $result; | ||
| + | } | ||
| + | |||
| + | sub print_summary { | ||
| + | my $result = "Hosts by status\n"; | ||
| + | $result .= sprintf( " | ||
| + | $result .= sprintf( " | ||
| + | $result .= sprintf( " | ||
| + | $result .= sprintf( " | ||
| + | $result .= sprintf( " | ||
| + | return $result; | ||
| + | } | ||
| + | |||
| + | # A non-efficient job is a job limited by something other than CPU, i.e. one | ||
| + | # that sits on the farm doing nothing, or just about. | ||
| + | # The way I used to measure it was not fine-grained enough, so I removed this | ||
| + | # feature for the time being. | ||
| + | # | ||
| + | #sub print_non_efficient { | ||
| + | # my $result = ""; | ||
| + | # if ( $non_efficient ne "" | ||
| + | # $result .= " | ||
| + | # } | ||
| + | # return $result; | ||
| + | #} | ||
| + | |||
| + | sub print_misc_messages { | ||
| + | my $result = ""; | ||
| + | if ( $misc ne "" | ||
| + | $result .= " | ||
| + | } | ||
| + | return $result; | ||
| + | } | ||
| + | |||
| + | sub process_lsload { | ||
| + | foreach my $line ( @lsload ) { | ||
| + | unless ( $line =~ / | ||
| + | my @fields = split( " ", $line ); | ||
| + | $hash{$fields[0]}{cpu_usage} = $fields[5]; | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | |||
| + | sub process_bhosts { | ||
| + | foreach my $line ( @bhosts ) { | ||
| + | unless ( $line =~ / | ||
| + | my @fields = split( " ", $line ); | ||
| + | $hash{$fields[0]}{status} = $fields[1]; | ||
| + | $hash{$fields[0]}{max_jobs} = $fields[3]; | ||
| + | $hash{$fields[0]}{nb_jobs} = $fields[4]; | ||
| + | $hash{$fields[0]}{run_jobs} = $fields[5]; | ||
| + | $hash{$fields[0]}{sys_susp} = $fields[6]; | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | |||
| + | sub process_host { | ||
| + | my ( $host ) = @_; | ||
| + | |||
| + | # add the host to a status class | ||
| + | if ( $hash{$host}{status} =~ qr/$CL_ADM/ ) { | ||
| + | add_to_closed_adm( $host ); | ||
| + | } | ||
| + | elsif ( $hash{$host}{status} =~ qr/ | ||
| + | add_to_busy( $host ); | ||
| + | } | ||
| + | elsif ( $hash{$host}{status} =~ qr/ | ||
| + | add_to_full( $host ); | ||
| + | } | ||
| + | elsif ( $hash{$host}{status} =~ qr/$OK/ ) { | ||
| + | add_to_ok( $host ); | ||
| + | } | ||
| + | elsif ( $hash{$host}{status} =~ qr/ | ||
| + | add_to_unavail( $host ); | ||
| + | } | ||
| + | elsif ( $hash{$host}{status} =~ qr/ | ||
| + | add_to_unreach( $host ); | ||
| + | } | ||
| + | } | ||
| + | |||
| + | sub send_report { | ||
| + | my ( $statusmsg ) = @_; | ||
| + | if ( ( $non_efficient ne "" | ||
| + | $color = " | ||
| + | } | ||
| + | # Build the command we use to send a status to the Xymon daemon | ||
| + | my $cmd = $bb . " " . $bbdisp . " \" | ||
| + | # And send the message | ||
| + | system $cmd; | ||
| + | } | ||
| + | |||
| + | #### MAIN | ||
| + | |||
| + | # Get the BB and BBDISP environment settings. | ||
| + | $bb = $ENV{" | ||
| + | $bbdisp = $ENV{" | ||
| + | |||
| + | my $statusmsg = ""; | ||
| + | |||
| + | # get the lsf reports | ||
| + | process_lsload; | ||
| + | process_bhosts; | ||
| + | |||
| + | # get host status | ||
| + | foreach my $host ( sort keys %hash ) { | ||
| + | print " | ||
| + | process_host( $host ); | ||
| + | } | ||
| + | |||
| + | $statusmsg .= print_summary; | ||
| + | $statusmsg .= print_classes if $USE_CLASSES; | ||
| + | $statusmsg .= print_non_full_or_ok; | ||
| + | #$statusmsg .= print_non_efficient; | ||
| + | $statusmsg .= print_misc_messages; | ||
| + | send_report( $statusmsg ); | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | ===== Known Bugs and Issues ===== | ||
| + | |||
| + | ===== To Do ===== | ||
| + | |||
| + | ===== Credits ===== | ||
| + | |||
| + | ===== Changelog ===== | ||
| + | |||
| + | * **2007-04-08** | ||
| + | * Initial release | ||