[linux-elitists] A better cron: large production environments

Tony Godshall togo at of.net
Mon May 10 10:05:33 PDT 2010


...
>> >That wrapper itself would be what cron would invoke, the wrapper would
>> >run the actual job (and handle the notifications/logging) independently
>> >of cron.
...

>> Don't forget it needs to bail if the last invocation didn't finish...

> Good point.  Lockfiles.  We're having a little issue with this ATM.

Here's a bash script[1] that I run from cron like so:

15,45 * * * * root /usr/local/sbin/donotify check_disk_space-11.1

It takes a config file like this[2].

It checks for a lockdir, executes the command, notes any failure, and
sends a notification by email once there have been n failures in m
minutes.  It requires nothing on the source besides bash and ssh and
ssh access to a mailserver.  And of course if you add non-email
notification of some kind you don't even need that.

My intent is to enhance it over time to include non-email notification
schemes, like you want as well.  Use it as you will, and please feed
back any enhancements you make.  In particular, I want to add
non-email notifications, and locking might only be 99.9% perfect (but
good enough for minute-level, i.e. cron, start time granularity).

Tony


[1] (/usr/local/sbin/donotify)

#!/bin/bash
#
# donotify - run a configured command and notify on repeated failure.
#
# Usage: donotify tag [args...]
#   tag   name of a non-empty config file under $CFGDIR
#   args  anything after the tag stays in "$@" for the rest of the script

export PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin

myname=$(basename "$0")
CFGDIR="/usr/local/etc/donotify"

# Start from a known state so a "needhelp" inherited from the environment
# cannot force the usage path.
needhelp=

tag="$1"
# Only shift when there is something to shift; a bare "shift" with no
# positional parameters returns non-zero.
if [ $# -gt 0 ]; then
    shift
fi

if [ -z "$tag" ]; then
    echo "missing tag" >&2
    needhelp=x
else
    cfgfile="$CFGDIR/$tag"
    # -s: the config must exist and be non-empty.
    if [ ! -s "$cfgfile" ]
    then
        echo "no $cfgfile" >&2
        needhelp=x
    fi
fi


if [ -n "$needhelp" ]
then
    echo "USAGE: $myname tag" >&2
    echo "   does command in $CFGDIR/tag, and if it fails, notifies as per config there" >&2
    exit 1
fi


# Age in minutes after which a still-running lock holder is treated as stale.
# It is assigned before the config is read, so the config file can override it.
STALEMIN=120

# Show which config file is in use, then read its settings into this shell.
printf 'cfgfile[%s]\n' "$cfgfile"
source "$cfgfile"

# Per-tag lock.  The lock is a directory holding a "pid" file with the
# owner's process id; a failing mkdir below is treated as losing a race
# against another instance.
lockdir=/var/run/$myname-$tag
if [ -s "$lockdir/pid" ]
then
  otherpid=$(cat "$lockdir/pid")

  # Only a positive decimal number is accepted as a PID.  Anything else must
  # never reach kill: "0" would signal our own process group and a leading
  # "-" would be parsed as an option.
  otherpid_ok=
  case "$otherpid" in
    ''|*[!0-9]*) ;;
    *) if [ "$otherpid" -gt 0 ] 2> /dev/null; then otherpid_ok=x; fi ;;
  esac

  if [ -n "$otherpid_ok" ] && kill -0 "$otherpid" 2> /dev/null
  then
    # The holder is alive.  touchback gets a timestamp STALEMIN minutes in
    # the past; a pid file newer than that is still within its allowance.
    touch -d "-$STALEMIN minutes" "$lockdir/touchback"
    if [ "$lockdir/pid" -nt "$lockdir/touchback" ]
    then
      ls -l "$lockdir/pid"
      echo "$myname: DEFER to $otherpid" >&2
      exit 0
    else
      # Holder has outlived STALEMIN: ask it to stop, then force it if it is
      # still there a second later.
      echo "$myname: $lockdir/pid STALE: KILL $otherpid" >&2
      if kill "$otherpid"
      then
        sleep 1
        if kill -0 "$otherpid" 2> /dev/null
        then
          echo "$myname: HARDKILL $otherpid" >&2
          kill -9 "$otherpid"
        fi
      fi
    fi
  else
    # No such process (or no usable PID in the file): leftover lock.
    echo "cleanup stale $lockdir" >&2
  fi
  rm -Rf "$lockdir"
fi
if ! mkdir "$lockdir"
then
    echo "$myname $$: race condition: abort" >&2
    exit 1
fi
echo $$ > "$lockdir/pid"

# Drop the lock on every exit path, not just the normal end of the script.
# A lock left behind by an early exit keeps a dead PID on disk; if that PID
# is later reused, the stale-lock logic above would defer to, and eventually
# kill, an unrelated process.  Only remove the lock while it still carries
# our PID, so a successor's lock is never removed by mistake.
release_lock() {
  if [ "$(cat "$lockdir/pid" 2> /dev/null)" = "$$" ]
  then
    rm -Rf "$lockdir"
  fi
}
trap release_lock EXIT

# State directories for this tag: one file per run under trackincidents,
# one file per sent notification under notified.
trackincidentdir="/var/lib/donotify/$tag/trackincidents"
mkdir -p "$trackincidentdir" || exit $?
notifieddir="/var/lib/donotify/$tag/notified"
mkdir -p "$notifieddir" || exit $?

now=$(date +%Y%m%d-%H%M%S)
echo "executing: $cmd $*"

# Capture the command's output in a private, unpredictably named file
# instead of /tmp/$$, which another local user could pre-create or symlink.
# mktemp creates the file accessible to its owner only.
cmdout=$(mktemp /tmp/donotify.XXXXXX) || exit $?

# $cmd is deliberately unquoted: the config stores the command and its
# arguments in a single string that has to be split into words here.
$cmd "$@" > "$cmdout" 2>&1
rslt=$?
cat "$cmdout"

# The run's timestamp and exit status are encoded in the file name; the
# captured output becomes the file's content.
trackincidentfile="$trackincidentdir/$now.rslt=$rslt"
mv -f "$cmdout" "$trackincidentfile"
ls -l "$trackincidentfile"

# Remove any incidents beyond our horizon: every run record not modified
# within the last $threshold_minutes minutes is appended to the per-tag
# incident log (listing line, then content) and deleted.  Paths are read
# NUL-delimited so a tag containing whitespace or glob characters cannot
# split or expand a path.
while IFS= read -r -d '' f
do
    mkdir -p /var/log/donotify/incidents || exit $?
    ls -l "$f" >> "/var/log/donotify/incidents/$tag"
    cat "$f" >> "/var/log/donotify/incidents/$tag"
    /bin/rm "$f"
done < <(find "$trackincidentdir"/. -type f ! -mmin "-$threshold_minutes" -print0)

# Remove any notifications beyond our horizon: notification markers not
# modified within the last $rereport_minutes minutes are logged (listing
# line only) and deleted.
while IFS= read -r -d '' f
do
    mkdir -p /var/log/donotify/notified || exit $?
    ls -l "$f" >> "/var/log/donotify/notified/$tag"
    /bin/rm "$f"
done < <(find "$notifieddir"/. -type f ! -mmin "-$rereport_minutes" -print0)

# Act on the exit status of the command that was just run.
case "$rslt" in
0)
    # Success: if the config sets success_clears, archive every run record
    # to the per-tag incident log and delete it.
    if [ -n "$success_clears" ]
    then
        echo "success clears all incidents: clearing $trackincidentdir/."
        # Make sure the log directory exists before anything is deleted;
        # otherwise the appends fail and the records are lost.
        mkdir -p /var/log/donotify/incidents || { rm -Rf "$lockdir"; exit 1; }
        while IFS= read -r -d '' f
        do
            ls -l "$f" >> "/var/log/donotify/incidents/$tag"
            cat "$f" >> "/var/log/donotify/incidents/$tag"
            /bin/rm "$f"
        done < <(find "$trackincidentdir"/. -type f -print0)
    fi
    ;;
*)
    # Failure: count the run records whose recorded status does not start
    # with 0.
    countfails=$(find "$trackincidentdir"/. -type f -name '*.rslt=[^0]*'|wc -l)
    echo "countfails=$countfails"
    # NOTE(review): the sample config's comment says a threshold of 1 means
    # immediate notify, but -gt only notifies once the count exceeds the
    # threshold - confirm which is intended.
    if [ "$countfails" -gt "$threshold_count" ]
    then
        # Any marker still present in notifieddir suppresses a new mail.
        countnotifies=$(find "$notifieddir"/. -type f |wc -l)
        if [ "$countnotifies" -gt 0 ]
        then
            echo "skipping notify since notified $countnotifies times in $rereport_minutes min"
        else
            subj="$report_email_subject"

            # Build the report once, in a private temp file rather than the
            # predictable /tmp/$$: a summary line, the sorted list of run
            # records, then each record's listing and content.
            report=$(mktemp /tmp/donotify.XXXXXX) || { rm -Rf "$lockdir"; exit 1; }
            {
                echo "$countfails -gt $threshold_count"
                find "$trackincidentdir"/. -type f | sort
                while IFS= read -r -d '' f
                do
                    ls -l "$f"
                    cat "$f"
                done < <(find "$trackincidentdir"/. -type f -print0)
            } > "$report"

            # $report_email_to is deliberately unquoted: it is a
            # whitespace-separated list of recipients.
            for target in $report_email_to
            do
                # The subject carries literal double quotes so it reaches the
                # remote side as one argument.
                # NOTE(review): "254" is the ssh destination as written -
                # presumably a host alias; confirm.
                if /usr/bin/ssh 254 /bin/mail -s '"'"$subj"'"' "$target" < "$report"
                then
                    # Record the successful send.
                    echo "$target" >> "$notifieddir/$now"
                fi
            done
            rm -f "$report"
        fi
    fi
    ;;
esac

# Release the lock taken at startup.
rm -Rf "$lockdir"
exit 0

[2] (/usr/local/etc/donotify/check_disk_space-11.1)

# Sample donotify config; donotify sources this file.

# Command line to run; donotify splits this string into words.
cmd='check_disk_space_ssh 192.168.11.1   / 90   /alt 90   /home 95'

#number of failures can occur in an interval before we notify, 1 means immediate notify
# NOTE(review): donotify compares the failure count with -gt, so a
# notification needs more than threshold_count failures - confirm the
# "1 means immediate" wording above.
threshold_count=3
# Length of that interval in minutes.
threshold_minutes=360

#this means a success clears all failures: success_clears=x
#this means a success doesn't: success_clears=
success_clears=x

#how many minutes before we report the condition again
rereport_minutes=$((8*60))

# Whitespace-separated recipient list.
# NOTE(review): the addresses below show " at " where "@" would be expected
# (probably rewritten by the list archive); as written the list splits into
# six words - restore the real addresses before use.
report_email_to="togo at of.net emile at westernstatesglass.com"
report_email_subject="donotify $tag"
# Not referenced by the donotify script shown above.
report_to_log="/var/log/$tag"


More information about the linux-elitists mailing list