#!/bin/sh
#set -xv
#
# Runs the scripts/check_galaxy.py script in a way that's easy to handle from cron
#
# The check is run once; if it fails, the wrapper sleeps and runs it a second
# time in case the problem is intermittent.  If the second run also fails, the
# -m addresses are mailed the full output of that run and the -p addresses are
# mailed its last line.  Once a later check succeeds, everybody who was
# notified gets an "OK" mail.
#
# State is kept under $HOME/.check_galaxy/<galaxy_host>/:
#   login         credentials file that must exist before checks are run
#   mail/, page/  one empty file per address that has already been notified
#   wrap.mutex    pid of the running wrapper (only one runs at once)
#   wrap.count    number of runs skipped because an older wrapper is alive
#   wrap.stagger  number of failed runs since the last staggered notification
#
# Exit status: 1 on a usage error, 0 in every other case (including failed
# checks), so that cron only sees output when debugging is enabled.
#

# defaults (note: default sleep is below since it depends on debug)
DEBUG=0
STAGGER=0
INTERVAL=3
MAIL=
PAGE=
NEWHIST=
BADARG=0

# Print a message to stdout, but only when -d was given.
debug() {
    if [ "$DEBUG" = 1 ]; then
        printf '%s\n' "$*"
    fi
}

# Mail an "OK" notice to every address recorded in the notified directories
# and remove the record, so the address is notified again on the next outage.
send_recovery_notices() {
    for file in "$NOTIFIED_MAIL"/* "$NOTIFIED_PAGE"/*; do
        recip=$(basename "$file")
        # the literal string including the * will be passed if the dir is empty
        [ "$recip" = '*' ] && continue
        echo "$GALAXY_HOST is now okay" | mail -s "$GALAXY_HOST OK" "$recip"
        rm -f "$file"
        debug "up: mailed $recip"
    done
}

# get commandline opts
while getopts dsi:l:m:p:n optname
do
    case $optname in
        d) DEBUG=1 ;;
        s) STAGGER=1 ;;
        i) INTERVAL=$OPTARG ;;
        l) SLEEP=$OPTARG ;;
        m) MAIL="$MAIL $OPTARG" ;;
        p) PAGE="$PAGE $OPTARG" ;;
        n) NEWHIST="-n" ;;
        *) BADARG=1 ;;
    esac
done
shift $((OPTIND - 1))

# BADARG is compared against 1 explicitly: a bare [ "$BADARG" ] is true for
# any non-empty string, including "0".
if [ -z "$1" ] || [ "$BADARG" = 1 ]; then
    cat <<EOF
usage: $(basename "$0") [-dsn] [-i interval] [-l seconds] [-m email_address]+ [-p pager_address]+ <galaxy_host>
  -d                Print debugging information.
  -s                Stagger mailing the pagers/emails, instead of all at once when
                    there's a problem.  Useful for running check_galaxy at night.
  -i <interval>     The number of times this wrapper should execute before mailing
                    the next address, when staggering is enabled.  Mail is sent
                    every <interval> runs of the program, so the actual time
                    between emails is:
                      time = (<interval>) * (how often wrapper runs from cron)
  -l <seconds>      This wrapper runs check_galaxy a second time if the first check
                    fails, in case the problem is intermittent.  <seconds> is how
                    many seconds to sleep between checks.
  -m <address>      Email addresses to send the full check_galaxy output to, if
                    Galaxy is down.  Use multiple -m options to specify multiple
                    addresses.  When staggering, email will be sent in the order
                    which you specify -m options on the command line.
  -p <address>      Like -m, but sends just the last line of check_galaxy's output.
                    Useful for pagers.  When staggering is enabled and both -m and
                    -p options are present, the first -m address and the first -p
                    address are mailed simultaneously, followed by the second -m
                    and second -p, and so on.
  -n                Create a new history (passes the -n option to check_galaxy.py).
  <galaxy_host>     The hostname of the Galaxy server to check.  Use a : if running
                    on a non-80 port (e.g. galaxy.example.com:8080).
EOF
    exit 1
fi

GALAXY_HOST=$1

# Default pause between the two checks: short when debugging, a minute otherwise.
if [ -z "$SLEEP" ]; then
    if [ "$DEBUG" = 1 ]; then
        SLEEP=2
    else
        SLEEP=60
    fi
fi

# globals
CRON_DIR=$(dirname "$0")
SCRIPTS_DIR="$CRON_DIR/../scripts"
CHECK_GALAXY="$SCRIPTS_DIR/check_galaxy.py"
VAR="$HOME/.check_galaxy"

# sanity
if [ ! -f "$CHECK_GALAXY" ]; then
    debug "$CHECK_GALAXY is missing"
    exit 0
fi

# Do any other systems' default ps not take BSD ps args?
case $(uname -s) in
    SunOS) PS="/usr/ucb/ps" ;;
    *)     PS="ps" ;;
esac

NOTIFIED_MAIL="$VAR/$GALAXY_HOST/mail"
NOTIFIED_PAGE="$VAR/$GALAXY_HOST/page"
MUTEX="$VAR/$GALAXY_HOST/wrap.mutex"
COUNT="$VAR/$GALAXY_HOST/wrap.count"
STAGGER_FILE="$VAR/$GALAXY_HOST/wrap.stagger"
for dir in "$VAR/$GALAXY_HOST" "$NOTIFIED_MAIL" "$NOTIFIED_PAGE"; do
    if [ ! -d "$dir" ]; then
        if ! mkdir -p -m 0700 "$dir"; then
            debug "unable to create dir: $dir"
            exit 0
        fi
    fi
done

if [ ! -f "$VAR/$GALAXY_HOST/login" ]; then
    [ "$DEBUG" = 1 ] && cat <<EOF
Please create the file:
  $VAR/$GALAXY_HOST/login
This should contain a username and password to log in to
Galaxy with, on one line, separated by whitespace, e.g.:

check_galaxy@example.com password

If the user does not exist, check_galaxy will create it
for you.
EOF
    exit 0
fi

# When staggering, pick up where the previous failed run left off.  With no
# stagger file yet the counter starts at INTERVAL, so the first failure
# notifies immediately.
if [ "$STAGGER" = 1 ]; then
    if [ -f "$STAGGER_FILE" ]; then
        STAGGER_COUNT=$(cat "$STAGGER_FILE")
    else
        STAGGER_COUNT=$INTERVAL
    fi
fi

# only run one at once
if [ -f "$MUTEX" ]; then
    pid=$(cat "$MUTEX")
    if $PS p "$pid" >/dev/null 2>&1; then
        # The previous wrapper is still alive: count this skipped run and
        # warn once, on the run where the count is exactly 3.
        if [ -f "$COUNT" ]; then
            count=$(cat "$COUNT")
        else
            count=0
        fi
        # Only warn when there is somebody to warn; mail(1) needs a recipient.
        # $MAIL is left unquoted on purpose: it is a whitespace-separated list.
        if [ "$count" -eq 3 ] && [ -n "$MAIL" ]; then
            echo "A check_galaxy process for $GALAXY_HOST has been running for an unusually long time.  Something is broken." \
                | mail -s "$GALAXY_HOST problems" $MAIL
        fi
        expr "$count" + 1 > "$COUNT"
        exit 0
    else
        # stale mutex
        rm -f "$MUTEX"
    fi
fi

rm -f "$COUNT"
echo $$ > "$MUTEX"

# $NEWHIST is left unquoted on purpose so that it vanishes when empty.
# The output of the first attempt is never reported, so it is discarded.
debug "running first check"
if "$CHECK_GALAXY" $NEWHIST "$GALAXY_HOST" >/dev/null 2>&1; then
    debug "first check succeeded"
    send_recovery_notices
    rm -f "$MUTEX" "$STAGGER_FILE"
    exit 0
fi

# if failure, wait and try again
debug "first check failed, sleeping $SLEEP seconds for second run"
sleep "$SLEEP"

debug "running second check"
if second_try=$("$CHECK_GALAXY" $NEWHIST "$GALAXY_HOST" 2>&1); then
    debug "second check succeeded"
    send_recovery_notices
    rm -f "$MUTEX" "$STAGGER_FILE"
    exit 0
fi

debug "second check failed"
if [ "$STAGGER" = 1 ]; then
    # -ge rather than -eq: if the stored count ever exceeds INTERVAL (for
    # example after -i was lowered) the counter must still wrap around
    # instead of growing forever without sending anything.
    if [ "$STAGGER_COUNT" -ge "$INTERVAL" ]; then
        # send notification this run
        echo 1 > "$STAGGER_FILE"
    else
        # don't send notification this run
        debug "$GALAXY_HOST is down, but it's not time to send an email.  STAGGER_COUNT was $STAGGER_COUNT"
        expr "$STAGGER_COUNT" + 1 > "$STAGGER_FILE"
        rm -f "$MUTEX"
        exit 0
    fi
fi

# Full output goes to every -m address that has not been notified yet; when
# staggering, stop after the first address that gets mailed this run.
for recip in $MAIL; do
    if [ ! -f "$NOTIFIED_MAIL/$recip" ]; then
        printf '%s\n' "$second_try" | mail -s "$GALAXY_HOST problems" "$recip"
        touch "$NOTIFIED_MAIL/$recip"
        debug "dn: mailed $recip"
        [ "$STAGGER" = 1 ] && break
    fi
done

# Same for the -p addresses, which only get the last line of the output.
for recip in $PAGE; do
    if [ ! -f "$NOTIFIED_PAGE/$recip" ]; then
        printf '%s\n' "$second_try" | tail -1 | mail -s "$GALAXY_HOST problems" "$recip"
        touch "$NOTIFIED_PAGE/$recip"
        debug "dn: mailed $recip"
        [ "$STAGGER" = 1 ] && break
    fi
done

rm -f "$MUTEX"
exit 0