AWS nat monitor and route switch script

This script will monitor another NAT instance and take over its routes if communication with the other instance fails

[root@ip-10 bin]# cat nat_monitor.sh 
#!/bin/sh
# This script will monitor another NAT instance and take over its routes
# if communication with the other instance fails

# NAT instance variables
# Other instance's ID to check and route to grab if other node goes down
NAT_ID=i-0273ed20b000000
NAT_RT_ID=rtb-000000a

# My route to grab when I come back up
My_RT_ID=rtb-000000b

# Specify the EC2 region that this will be running in (e.g. https://ec2.us-east-1.amazonaws.com)
EC2_URL=https://ec2.cn-north-1.amazonaws.com.cn

# Health Check variables
Num_Pings=3
Ping_Timeout=1
Wait_Between_Pings=2
Wait_for_Instance_Stop=60
Wait_for_Instance_Start=300

# Run aws-apitools-common.sh to set up default environment variables and to
# leverage AWS security credentials provided by EC2 roles
. /etc/profile.d/aws-apitools-common.sh

# Determine the NAT instance private IP so we can ping the other NAT instance, take over
# its route, and reboot it.  Requires EC2 DescribeInstances, ReplaceRoute, and Start/RebootInstances
# permissions.  The following example EC2 Roles policy will authorize these commands:
# {
#  "Statement": [
#    {
#      "Action": [
#        "ec2:DescribeInstances",
#        "ec2:CreateRoute",
#        "ec2:ReplaceRoute",
#        "ec2:StartInstances",
#        "ec2:StopInstances"
#      ],
#      "Effect": "Allow",
#      "Resource": "*"
#    }
#  ]
# }

# Get this instance's ID
Instance_ID=`/usr/bin/curl --silent http://169.254.169.254/latest/meta-data/instance-id`
# Get the other NAT instance's IP
NAT_IP=`/opt/aws/bin/ec2-describe-instances $NAT_ID -U $EC2_URL | grep PRIVATEIPADDRESS -m 1 | awk '{print $2;}'`

echo `date` "-- Starting NAT monitor"
echo `date` "-- Adding this instance to $My_RT_ID default route on start"
/opt/aws/bin/ec2-replace-route $My_RT_ID -r 0.0.0.0/0 -i $Instance_ID -U $EC2_URL
# If replace-route failed, then the route might not exist and may need to be created instead
if [ "$?" != "0" ]; then
   /opt/aws/bin/ec2-create-route $My_RT_ID -r 0.0.0.0/0 -i $Instance_ID -U $EC2_URL
fi

while [ . ]; do
  # Check health of other NAT instance
  pingresult=`ping -c $Num_Pings -W $Ping_Timeout $NAT_IP | grep time= | wc -l`
  # Check to see if any of the health checks succeeded, if not
  if [ "$pingresult" == "0" ]; then
    # Set HEALTHY variables to unhealthy (0)
    ROUTE_HEALTHY=0
    NAT_HEALTHY=0
    STOPPING_NAT=0
    while [ "$NAT_HEALTHY" == "0" ]; do
      # NAT instance is unhealthy, loop while we try to fix it
      if [ "$ROUTE_HEALTHY" == "0" ]; then
    	echo `date` "-- Other NAT heartbeat failed, taking over $NAT_RT_ID default route"
    	/opt/aws/bin/ec2-replace-route $NAT_RT_ID -r 0.0.0.0/0 -i $Instance_ID -U $EC2_URL
	ROUTE_HEALTHY=1
      fi
      # Check NAT state to see if we should stop it or start it again
      NAT_STATE=`/opt/aws/bin/ec2-describe-instances $NAT_ID -U $EC2_URL | grep INSTANCE | awk '{print $5;}'`
      if [ "$NAT_STATE" == "stopped" ]; then
    	echo `date` "-- Other NAT instance stopped, starting it back up"
        /opt/aws/bin/ec2-start-instances $NAT_ID -U $EC2_URL
	NAT_HEALTHY=1
        sleep $Wait_for_Instance_Start
      else
	if [ "$STOPPING_NAT" == "0" ]; then
    	  echo `date` "-- Other NAT instance $NAT_STATE, attempting to stop for reboot"
	  /opt/aws/bin/ec2-stop-instances $NAT_ID -U $EC2_URL
	  STOPPING_NAT=1
	fi
        sleep $Wait_for_Instance_Stop
      fi
    done
  else
    sleep $Wait_Between_Pings
  fi
done

add script to crontab, will run auto after booting

[root@ip-10 bin]# crontab -l
@reboot /root/bin/nat_monitor.sh >> /tmp/nat_monitor.log