#!/usr/bin/perl -w
#
# remrrds, REMove_Round_Robin_Database_Spikes:
# This script removes spikes in an RRD. The lower bound and/or the upper bound
# for the legal values, that is the values to retain, can be specified. If any
# number in a row exceeds the boundaries, all values in that row will be set to
# 'NaN'. The same row for any other consolidation function will be cleared too.
#
# This script is based on script removespikes.pl, which removes all values which
# are outside the range of common values, which cover 0.99 of all values.
#
# An essential extension on the original script is that in the analysis phase
# also the consolidation functions are considered. A lower bound is applied to
# the RRA using consolidation function MIN, and any row to be deleted is also
# deleted in the RRA's with another consolidation function. The same goes for
# the upper bound and the RRA using consolidation function MAX.
#
# Another extension is the possibility to define one or more names of data sets.
# Only those data sets will be checked, the others will be skipped. By default
# all data sets will be checked.
#
# remrrds.pl [-dhv] [-l <X>] [-u <X>] [-s <N>] [-b <T>] <RrdName>
#
# Written by W.J.M. Nelis, wim.nelis@ziggo.nl, 2019.12
#
# To do:
# - Check size of newly created RRD, and restore old one if the new one is
# shorter.
#
use strict ;
use Getopt::Std ;
# Command line parameters.
#
my %opt= () ;                           # All command line parameters
my $DEBUG  = undef ;                    # Flag: debug mode
my $VERBOSE= undef ;                    # Flag: verbose mode
my $LOWTIM = undef ;                    # Time before which all rows are deleted
my $LOWBND = undef ;                    # Lower bound of correct values
my $UPPBND = undef ;                    # Upper bound of correct values
my @DATASET= () ;                       # Data sets to check
my $RRDFIL = undef ;                    # Name of RRD file
#
# Define the names of the temporary files containing the XML version of the RRD.
# The process id is appended to each name, so that two instances of this script
# running at the same time do not overwrite or remove each other's dump files.
#
my $XMLOLD= "/tmp/rrd.dump.old.$$" ;    # Name of XML dump of original RRD
my $XMLNEW= "/tmp/rrd.dump.new.$$" ;    # Name of modified XML dump
#
# In case multiple consolidation functions are used in the RRD, define the
# preferred list of consolidated data to look for out-of-bounds values. Note
# that it is not trivial to determine the preference of consolidation function
# LAST, as its average value is AVERAGE but its fluctuations will be larger.
#
my @pcfLOW= ( 'MIN', 'AVERAGE', 'LAST', 'MAX' ) ;
my @pcfUPP= ( 'MAX', 'AVERAGE', 'LAST', 'MIN' ) ;
# Global variables.
#
my $RrdMod= 0 ;                         # Flag: RRD is modified
my @xml= () ;                           # Save area XML file, one line per element
my @ds = () ;                           # List of data sets used in RRD
my %ds = () ;                           # Data sets to check: column index => name
my %cf = () ;                           # Consolidation functions used in RRD
my $lbcf= undef ;                       # CF to use for lower bound, if specified
my $ubcf= undef ;                       # CF to use for upper bound, if specified
#
# Function AnalyseRrd takes the XML dump of an RRD, as stored in array @xml, and
# determines the data sets (DS's) in use and the consolidation functions (CF's)
# in use. It fills the globals @ds, %cf, %ds, $lbcf and $ubcf. If no data sets
# were specified on the command line, @DATASET is set to all data sets found.
#
sub AnalyseRrd() {
    #
    # Determine the list of data sets as well as the consolidation functions in
    # use in this RRD. A <name> line which cannot be parsed still yields an
    # (empty) entry in @ds, so that the position of a name in @ds keeps matching
    # the position of its value within a row. A <cf> line which cannot be parsed
    # is skipped, so that the list assigned to %cf always has key/value pairs.
    #
    @ds= map { m/<name>\s*(.+?)\s*<\/name>/ ? $1 : '' } grep /<name>/, @xml ;
    %cf= map { m/<cf>([A-Z]+)<\/cf>/ ? ( $1 => 0 ) : () } grep /<cf>/ , @xml ;
    #
    # Build a list of indices of the data sets to check. If the data sets to
    # check are not specified, using parameter -s, all data sets will be checked.
    # The names are compared as a whole: a data set is only selected if its
    # complete name was requested, not if its name is merely part of a requested
    # name.
    #
    @DATASET= @ds unless @DATASET ;
    my %wanted= map { $_ => 1 } @DATASET ;
    %ds= () ;
    for my $i ( 0 .. $#ds ) {
        $ds{$i}= $ds[$i] if $wanted{ $ds[$i] } ;
    } # of for
    #
    # Tell the user about requested data sets which do not exist in this RRD, as
    # they would otherwise be ignored silently.
    #
    my %known= map { $_ => 1 } @ds ;
    foreach my $name ( grep { not $known{$_} } @DATASET ) {
        warn "Data set '$name' is not defined in $RRDFIL, ignored\n" ;
    } # of foreach
    #
    # Determine the CF to use when checking the lower bound (if specified) and
    # the upper bound (if specified): the first CF in the preference list which
    # is in use in this RRD. The result is undef if none of them is in use.
    #
    $lbcf= ( grep { exists $cf{$_} } @pcfLOW )[0] ;
    $ubcf= ( grep { exists $cf{$_} } @pcfUPP )[0] ;
    return ;
} # of AnalyseRrd
#
# Function CrackParameters extracts all parameters from the command line and
# stores them in the globals $DEBUG, $VERBOSE, $LOWTIM, $LOWBND, $UPPBND,
# @DATASET and $RRDFIL. The function returns a true value upon successful
# completion and a false value if the parameters are invalid or incomplete. If
# option -h is given or no arguments are left, the help text is shown and the
# script exits.
#
sub CrackParameters() {
    return 0 unless getopts( "dhvb:l:s:u:", \%opt ) ;
    # Check for the flags in the command line.
    ShowHelp(), exit if $opt{h} || ($#ARGV < 0) ;
    if ( $opt{d} ) {
        $DEBUG  = 1 ;
        $VERBOSE= 1 ;
        print "Enabling DEBUG mode\n" ;
    } # of if
    if ( $opt{v} ) {
        $VERBOSE= 1 ;
        print "Running in VERBOSE mode\n" ;
    } # of if
    # Check for the parameters with an associated value in the command line.
    # Test for definedness rather than truth: a value of 0 is a valid boundary.
    if ( defined $opt{b} ) {
        $LOWTIM= $opt{b} ;
        print " Time boundary set to $LOWTIM\n" if $VERBOSE ;
    } # of if
    if ( defined $opt{l} ) {
        $LOWBND= $opt{l} ;
        print " Lower bound set to $LOWBND\n" if $VERBOSE ;
    } # of if
    if ( defined $opt{s} ) {
        # Remove the blank spaces from each name, and drop only those names
        # which are empty afterwards.
        @DATASET= grep { length $_ }
                  map  { my $name= $_ ; $name=~ s/\s//g ; $name }
                  split( /,/, $opt{s} ) ;
        if ( scalar(@DATASET) == 1 ) {
            print " Data set to check is $DATASET[0]\n" if $VERBOSE ;
        } else {
            print " Data sets to check are (" . join( ',', @DATASET) . ")\n" if $VERBOSE ;
        } # of else
    } # of if
    if ( defined $opt{u} ) {
        $UPPBND= $opt{u} ;
        print " Upper bound set to $UPPBND\n" if $VERBOSE ;
    } # of if
    # One parameter should be left, the name of the file containing the RRD. It
    # is only accepted if it has extension '.rrd' and is an existing plain file.
    if ( $#ARGV == 0 ) {
        if ( substr($ARGV[0],-4) eq '.rrd' ) {
            if ( -f $ARGV[0] ) {
                $RRDFIL= $ARGV[0] ;     # Save name of file
                print " Source is $RRDFIL\n" if $VERBOSE ;
            } # of if
        } # of if
    } # of if
    # Check for parameter consistency. At least one boundary needs to be defined.
    # The name of the RRD must be defined.
    return 0 unless defined $LOWBND or defined $UPPBND or defined $LOWTIM ;
    return 0 unless defined $RRDFIL ;
    die "Can't find file $RRDFIL\n" unless -f $RRDFIL ;
    return 1 ;                          # All is well
} # of CrackParameters
#
# Function DumpRrd dumps the content of the RRD $RRDFIL into XML file $XMLOLD.
# As this XML file will be read multiple times, it reads the file and stores the
# lines, without their line terminators, in array @xml. The script dies if the
# dump fails or if the XML file cannot be read.
#
sub DumpRrd() {
    print " Dumping $RRDFIL to an XML file...\n" if $VERBOSE ;
    # Invoke rrdtool without an intermediate shell, passing the output file as
    # an argument, so that blanks or shell meta characters in the name of the
    # RRD cannot break or alter the command.
    system( 'rrdtool', 'dump', $RRDFIL, $XMLOLD ) == 0
        or die "rrdtool dump failed: $?\n" ;
    open ( my $fh, '<', $XMLOLD ) or die "Can't read $XMLOLD: $!\n" ;
    chomp( @xml= <$fh> ) ;              # Read entire file
    close( $fh ) or die "Can't read $XMLOLD: $!\n" ;
    return ;
} # of DumpRrd
#
# Function InstallNewRrd takes the new version of the XML dump, file $XMLNEW,
# and creates an updated version of the RRD $RRDFIL. The original one is renamed
# with extension '.old'. The new RRD gets the permissions and the ownership of
# the original one. If the new RRD cannot be created, the original RRD is put
# back under its own name before the script dies.
#
sub InstallNewRrd() {
    my $RRDFILOLD= "$RRDFIL.old" ;      # Name of file containing original RRD
    print " Installing new version of RRD...\n" if $VERBOSE ;
    # Fetch the attributes of the original RRD before anything is changed.
    my($mode,$uid,$gid)= (stat($RRDFIL))[2,4,5] or die "Cannot stat $RRDFIL: $!\n" ;
    rename( $RRDFIL, $RRDFILOLD )
        or die "Cannot rename $RRDFIL to $RRDFILOLD: $!\n" ;
    #
    # Create the new RRD. Rrdtool is invoked without an intermediate shell, thus
    # special characters in the file name are harmless.
    #
    if ( system( 'rrdtool', 'restore', $XMLNEW, $RRDFIL ) != 0 ) {
        my $status= $? ;                # Save status, the clean up may change $?
        unlink $RRDFIL ;                # Remove a partially written RRD, if any
        rename( $RRDFILOLD, $RRDFIL )
            or warn "Cannot rename $RRDFILOLD back to $RRDFIL: $!\n" ;
        die "rrdtool restore failed: $status\n" ;
    } # of if
    # The mode returned by stat includes the file type, pass only the
    # permission bits to chmod.
    chmod( $mode & 07777, $RRDFIL )
        or warn "Cannot set permissions of $RRDFIL: $!\n" ;
    chown( $uid, $gid, $RRDFIL )
        or warn "Cannot set ownership of $RRDFIL: $!\n" ;
    return ;
} # of InstallNewRrd
#
# Function SaveXml writes the in-memory copy of the XML file, array @xml, to
# file $XMLNEW, one element per line. The script dies if the file cannot be
# written completely, as an incomplete XML file must not be used to build the
# new RRD.
#
sub SaveXml() {
    open ( my $fh, '>', $XMLNEW ) or die "Can't write $XMLNEW: $!\n" ;
    foreach my $line ( @xml ) {
        print {$fh} "$line\n" or die "Can't write $XMLNEW: $!\n" ;
    } # of foreach
    # Errors in buffered output may surface only when the file is closed.
    close( $fh ) or die "Can't write $XMLNEW: $!\n" ;
    return ;
} # of SaveXml
#
# Function ShowHelp shows a short help screen on standard output. It describes
# the command line syntax and each of the options.
#
sub ShowHelp() {
    print "RemRRDS: Remove spikes from an RRD.\n\n",
          "Usage:\n",
          "$0 [-dhv] [-l number] [-u number] [-b timestamp] [-s name[,name]] filename\n\n",
          "Where:\n",
          " -b remove all lines before unix timestamp\n",
          " -d enable debug messages\n",
          " -h print this message\n",
          " -l set the lower bound\n",
          " -s define data sets to be checked\n",    # Line terminator was missing
          " -u set the upper bound\n",
          " -v Verbose mode, show progress messages\n",
          " filename is the name of the rrd file\n" ;
    return ;
} # of ShowHelp
#
# Function StripSpikesOnePass scans the XML version of the RRD, array @xml, and
# removes any spikes it finds. The result is saved in the in-memory copy of the
# XML file. The result of this function is true if the XML file is modified, and
# false otherwise. This function is used whenever there is only one
# consolidation function in use, or if only a time boundary is specified. Thus
# if a row needs to be cleared, the only line to be changed is the one being
# processed.
#
# A row is cleared, that is all its numbers are replaced by 'NaN', if
#  - a time boundary ($LOWTIM) is defined and the time stamp of the row is
#    before that boundary, or
#  - a value of one of the data sets to check (%ds) is less than the lower bound
#    ($LOWBND) or greater than the upper bound ($UPPBND), if defined.
#
sub StripSpikesOnePass() {
    my $result= 0 ;                     # Function result, default to no change
    # A number as found in a row. The number of digits in the mantissa is not
    # fixed, so dumps written with another precision are handled as well.
    my $number= qr/-?\d\.\d+e[+-]\d+/ ;

    print " Removing spikes in one pass...\n" if $VERBOSE ;
    foreach ( @xml ) {
        # Only rows with at least one number are of interest.
        next unless m/<v>$number<\/v>/ ;
        my $modif= 0 ;                  # Flag: line image is to be modified

        # Check the time stamp of the row, taken from the comment in front of
        # it. A row without a recognisable time stamp is never cleared because
        # of its age.
        if ( defined $LOWTIM and m/\/\s+(\d+)\s+\-\->\s+<row>/ ) {
            $modif= 1 if $1 < $LOWTIM ;
        } # of if

        unless ( $modif ) {
            my $i= -1 ;                 # Preset data set index
            foreach my $val ( m/(?<=>)(?:NaN|$number)\b/g ) {
                # See if this value (data set) needs to be checked.
                $i++ ;
                next if $val eq 'NaN' ;
                next unless exists $ds{$i} ;
                $modif= 1 if defined $LOWBND and $val < $LOWBND ;
                $modif= 1 if defined $UPPBND and $val > $UPPBND ;
                last if $modif ;        # Small optimization
            } # of foreach
        } # of unless
        next unless $modif ;

        s/>$number</>NaN</g ;           # Wipe out measurement(s)
        $result= 1 ;                    # Function result
        if ( $DEBUG ) {
            my $tsom= m/<!\-\-\s+(.+?)\s+\// ? $1 : 'unknown time' ;
            print " Removing measurement of $tsom\n" ;
        } # of if
    } # of foreach
    return $result ;
} # of StripSpikesOnePass
#
# Function StripSpikesTwoPasses scans the XML version of the RRD, array @xml,
# and removes any spikes it finds. The result is saved in the in-memory copy of
# the XML file. The result of this function is true if the XML file is modified,
# and false otherwise. This function is called if the RRD uses two or more
# consolidation functions. In the first pass the list of measurements to strip
# is determined using the preferred CF only ($lbcf for the lower bound, $ubcf
# for the upper bound). In the next pass, these measurements, identified by the
# number of primary data points per row and the time stamp, are cleared in each
# CF.
#
sub StripSpikesTwoPasses() {
    my $pdppr= undef ;                  # Primary data points per row in RRA
    my $cf   = undef ;                  # Consolidation function in RRA
    my $inrra= 0 ;                      # Flag: current line inside RRA def
    my $skrra= 0 ;                      # Flag: skip current RRA
    my $indb = 0 ;                      # Flag: current line in database
    my $result= 0 ;                     # Function result, default to no change
    # A number as found in a row. The number of digits in the mantissa is not
    # fixed, so dumps written with another precision are handled as well.
    my $number= qr/-?\d\.\d+e[+-]\d+/ ;
    # The unix time stamp in the comment in front of a row.
    my $rowts = qr/\/\s+(\d+)\s+\-\->\s+<row>/ ;

    print " Removing spikes in two passes...\n" if $VERBOSE ;
    # Prepare a hash to store the identification of the measurements to be
    # stripped. Such a measurement is identified by two numbers, the number of
    # primary data points per row and the time stamp of the measurement.
    #
    my %sl= map { m/row>(\d+)<\/pdp/ ? ( $1 => {} ) : () }
            grep /<pdp_per_row>/, @xml ;
    #
    # Phase A: Scan the appropriate RRA's for out-of-bound values. If one is
    # found, the pair (pdp_per_row,timestamp) is saved in hash %sl.
    #
    foreach ( @xml ) {
        unless ( $inrra ) {             # If not in an RRA definition,
            if ( m/^\s+<rra>$/ ) {      #  check for the start of it
                $inrra= 1 ;             # Set flag
                $skrra= 0 ;             # Reset skip flag
                $indb = 0 ;             # Reset in-database flag
            } # of if
            next ;                      # Line done
        } # of unless
        $inrra= 0, next if m/^\s+<\/rra>/ ;
        next if $skrra ;                # Done with line if skip flag set

        unless ( $indb ) {
            if ( m/^\s+<cf>([A-Z]+)<\/cf>$/ ) {
                $cf= $1 ;               # Save consolidation function name
                # Examine this RRA only if it holds the preferred CF for one
                # of the specified bounds.
                $skrra= 1 ;
                $skrra= 0 if defined $LOWBND and defined $lbcf and $cf eq $lbcf ;
                $skrra= 0 if defined $UPPBND and defined $ubcf and $cf eq $ubcf ;
                next ;                  # Line done
            } # of if
            $pdppr= $1 if m/^\s+<pdp_per_row>(\d+)<\/pdp_per_row>/ ;
            $indb = 1  if m/^\s+<database>$/ ;
            next ;                      # Line done, it is not a row
        } # of unless
        $indb= 0, next if m/^\s+<\/database>$/ ;

        # The line is a row in the database of an RRA to examine. A row which
        # cannot be identified can neither be marked nor cleared, skip it.
        next unless defined $pdppr and m/$rowts/ ;
        my $tsom = $1 ;                 # Time stamp of measurement
        my $modif= 0 ;                  # Flag: row is to be cleared

        # A row before the time boundary is marked if it contains a number.
        if ( defined $LOWTIM and $tsom < $LOWTIM ) {
            $modif= 1 if m/\.\d+e[+-]\d\d/ ;
        } # of if

        unless ( $modif ) {
            my $i= -1 ;                 # Preset data set index
            foreach my $val ( m/(?<=>)(?:NaN|$number)\b/g ) {
                # See if this value (data set) needs to be checked.
                $i++ ;
                next if $val eq 'NaN' ;
                next unless exists $ds{$i} ;
                $modif= 1 if defined $LOWBND and $val < $LOWBND ;
                $modif= 1 if defined $UPPBND and $val > $UPPBND ;
                last if $modif ;        # Small optimization
            } # of foreach
        } # of unless
        next unless $modif ;

        $sl{$pdppr}{$tsom}= 0 ;         # Clear these measurements
        if ( $DEBUG ) {
            my $when= m/<!\-\-\s+(.+?)\s+\// ? $1 : $tsom ;
            print " Marking measurement of $pdppr : $when\n" ;
        } # of if
    } # of foreach
    #
    # Phase B: Scan the XML file again, and if in a database the time stamp of a
    # row is marked in hash %sl, the row is cleared.
    #
    $pdppr= undef ;                     # Forget value left by phase A
    foreach ( @xml ) {
        $pdppr= $1, next if m/^\s+<pdp_per_row>(\d+)<\/pdp_per_row>/ ;
        next unless m/<row>/ ;
        next unless defined $pdppr and m/$rowts/ ;
        my $tsom= $1 ;                  # Time stamp of measurement
        # Test the first level key separately, to avoid the creation of an
        # entry in %sl as a side effect of the test.
        next unless exists $sl{$pdppr} and exists $sl{$pdppr}{$tsom} ;

        s/>$number</>NaN</g ;           # Wipe out measurement(s)
        $result= 1 ;                    # Update function result
        if ( $DEBUG ) {
            my $when= m/<!\-\-\s+(.+?)\s+\// ? $1 : $tsom ;
            print " Removing measurement of $pdppr : $when\n" ;
        } # of if
    } # of foreach
    return $result ;
} # of StripSpikesTwoPasses
#
# MAIN PROGRAM.
# =============
#
# Interpret the command line. If it is not acceptable, show the help text and
# stop.
#
if ( not CrackParameters() ) {
    ShowHelp() ;
    exit 0 ;
} # of if

DumpRrd() ;                             # Dump the RRD to a temporary XML file
AnalyseRrd() ;                          # Determine DS's and CF's in use

#
# One pass suffices if rows are selected by their time stamp only, or if the RRD
# uses exactly one consolidation function. In any other case the rows to clear
# are determined in one pass and cleared in a second pass.
#
my $OnlyTime= ( defined $LOWTIM and not defined $LOWBND and not defined $UPPBND ) ;
my $Changed = ( $OnlyTime or keys(%cf) == 1 ) ? StripSpikesOnePass()
                                               : StripSpikesTwoPasses() ;
$RrdMod= 1 if $Changed ;

# Build a new RRD only if at least one row has been cleared.
if ( $RrdMod ) {
    SaveXml() ;                         # Write new XML file
    InstallNewRrd() ;                   # Create new version of RRD file
} # of if

END {
    # Remove the intermediate files, whatever the reason to stop.
    unlink $XMLOLD ;
    unlink $XMLNEW ;
    # To do: if $RRDFILOLD exists, but RRDFIL does not (or is shorter), remove
    # $RRDFIL and rename $RRDFILOLD to $RRDFIL.
}