PaaS平台架构

1. 整体架构:

ui --> controller --> service --> mapper --> mysql     (后端,主要维护服务列表)

        ⬇

  AgentService --> sqlite        (虚机层面,主要维护当前虚机上运行的服务,同时向脚本下发 install, start, stop, uninstall, take-over, monitor_trace, installall, uninstallall 等命令)

    ⬇

       脚本 (包括中间件 mysql, redis, kafka, elk, nacos, sentinel, mongodb, skywalking 等的维护,以及被接管的 java 服务的维护)

2. 整个平台精华全在脚本,后端主要是向脚本下发命令,维护库的信息等。

3. 脚本简述

 agentctl.sh 举例:

#!/bin/bash
# agentctl.sh - install, start, stop, uninstall and self-heal the sitesupport
# agent JAR, and install a node_exporter systemd unit next to it.

# Directory that holds the downloaded packages (JAR, helper scripts, tarballs).
readonly PACKAGE_FULL_WAY="/opt/download/packages"
# Directory the agent is copied into and run from.
readonly AGENT_INSTALL_HOME="/opt/agent"

# File name of this control script; it is copied into AGENT_INSTALL_HOME.
readonly SH_HOME="agentctl.sh"
# File name of the agent JAR.
readonly JAR_NAME="sitesupport-agent-0.0.1-SNAPSHOT.jar"
# File name of the node_exporter tarball.
readonly NODE_EXPORTER="node_exporter-1.1.2.linux-amd64.tar.gz"

# Load common.sh and abort if it cannot be sourced. Helpers used below but not
# defined in this file (logInfo, logError, checkResult, ...) presumably come
# from there - verify against common.sh.
# shellcheck disable=SC1091
. "${PACKAGE_FULL_WAY}/common.sh" || exit

# Make sure root has a key pair at /root/.ssh/id_rsa and /root/.ssh/id_rsa.pub.
#
# Nothing is done when both files already exist. When only the private key
# exists, the public key is re-derived from it. Otherwise a new key pair with
# an empty passphrase is generated.
# Returns: 0 on success, 1 when ssh-keygen fails (after reporting through
#          logError); exits when /root/.ssh cannot be created.
function createSshkey() {
  local ssh_dir=/root/.ssh
  local private_key=${ssh_dir}/id_rsa
  local public_key=${private_key}.pub

  if [[ -e ${private_key} && -e ${public_key} ]]; then
    logInfo "ssh-key已存在!"
    return 0
  fi

  # Absolute paths are used throughout, so the caller's working directory is
  # left untouched.
  if [[ ! -d ${ssh_dir} ]]; then
    mkdir -p -m 700 "${ssh_dir}" || exit
  fi

  if [[ -e ${private_key} ]]; then
    # Only the public half is missing. Re-derive it instead of running the
    # generator, which would stop and ask whether to overwrite the private key.
    if ! ssh-keygen -y -f "${private_key}" >"${public_key}"; then
      rm -f -- "${public_key}"
      logError "ssh-key生成失败!"
      return 1
    fi
  elif ! ssh-keygen -f "${private_key}" -N ""; then
    logError "ssh-key生成失败!"
    return 1
  fi
  logInfo "ssh-key生成成功"
}

# Replace any JDK that is currently runnable with the packaged JDK RPM and
# prepare the JMX password file.
#
# Reads PACKAGE_FULL_WAY and JDK_PKG_NAME (JDK_PKG_NAME is not defined in this
# file). Leaves the working directory inside the JDK management directory.
function checkJdk() {
  logInfo "start check jdk...."
  if java -version >/dev/null 2>&1; then
    # A java binary is already on PATH: remove the old package and reload the
    # profile before installing the new one.
    logInfo "start remove old jdk..."
    yum remove jdk -y
    # shellcheck disable=SC1091
    . /etc/profile
  fi

  logInfo "start install new jdk..."
  rpm -ivh "${PACKAGE_FULL_WAY}/${JDK_PKG_NAME}" || logError "jdk1.8.0_291 未安装成功,请重新安装!"

  # Allow remote JMX access: build jmxremote.password from its template, add
  # the two role entries, then make the file read-only for its owner.
  local management_dir=/usr/java/jdk1.8.0_291-amd64/jre/lib/management
  local password_file=jmxremote.password
  cd "${management_dir}" || logError "${management_dir} 不存在!"
  cp "${password_file}.template" "${password_file}"
  chmod +w "${password_file}"
  printf '%s\n' "monitorRole QED" "controlRole R&D" >>"${password_file}"
  chmod 0400 "${password_file}"
  logInfo "the jdk is installed and the environment variables are configured"
}

# Make sure the cron daemon is running now and enabled at boot.
#
# The unit state is queried with "systemctl is-active" instead of scraping the
# third line of "systemctl status", whose layout is not a stable interface.
function checkCrond() {
  if ! systemctl is-active --quiet crond.service; then
    # Not active: start the cron service.
    systemctl start crond.service
  fi

  # Enable cron at boot.
  systemctl enable crond.service
}

# Unpack the node_exporter tarball into ${AGENT_INSTALL_HOME}/node_exporter,
# write a systemd unit for it, then enable and start the unit.
#
# Reads PACKAGE_FULL_WAY, NODE_EXPORTER and AGENT_INSTALL_HOME. Returns early
# (after logging) when the tarball is missing. Leaves the working directory
# inside the node_exporter directory.
function installNodeExporter() {
  local exporter_home=${AGENT_INSTALL_HOME}/node_exporter
  local tarball=${PACKAGE_FULL_WAY}/${NODE_EXPORTER}
  local unit_file=/usr/lib/systemd/system/node_exporter.service

  logInfo "start install node exporter..."
  if [[ ! -e ${tarball} ]]; then
    logInfo "node exporter不存在!"
    return
  fi

  # Extract straight into the install directory. --strip-components=1 drops
  # the tarball's single top-level directory, so no ls/mv/rm shuffle is needed.
  mkdir -p "${exporter_home}"
  tar -zxf "${tarball}" -C "${exporter_home}" --strip-components=1 >/dev/null 2>&1
  checkResult $? "tar node exporter package error"
  cd "${exporter_home}" || logError "${exporter_home} 不存在!"

  # (Re)write the unit file. \$MAINPID is escaped so that the literal text
  # reaches the unit file and systemd substitutes it; unescaped, the shell
  # would expand it to an empty string here.
  cat >"${unit_file}" <<EOF
[Unit]
Description=node_exporter
After=network-online.target remote-fs.target nss-lookup.target
Wants=network-online.target

[Service]
Type=simple
ExecStart=${exporter_home}/node_exporter
ExecReload=/bin/kill -s HUP \$MAINPID
ExecStop=/bin/kill -s TERM \$MAINPID

[Install]
WantedBy=multi-user.target
EOF

  systemctl daemon-reload
  systemctl enable node_exporter.service
  systemctl start node_exporter.service

  # TODO: register this exporter in the prometheus server configuration.
  # prometheus_ip is never filled in yet, so only the first branch can run.
  local prometheus_ip=""
  if [[ -z ${prometheus_ip} || ${prometheus_ip} == "null" ]]; then
    echo "下次一定!"
    # Disabled draft: look the prometheus install_ip up in a local
    # nacos_config file.
    # logInfo "nacos配置获取失败,开始从外部配置文件获取配置..."
    # i=0
    # temp=""
    # while true
    # do
    #   i=`expr $i + 1`
    #   temp=`sed -n "/^${i} /p" ${AGENT_INSTALL_HOME}/nacos_config | cut -d ' ' -f 2`
    #   if [[ ${temp} = "" ]];then break;fi;
    #   if [[ ${temp} =~ ^prometheus ]];then
    #     sed -n "/^${i} /,/^}$/p" ${AGENT_INSTALL_HOME}/nacos_config | sed -n -e '/^{$/,/^}$/p' | jq -r ".install_ip" > ip.txt
    #   fi
    # done
    # prometheus_ip=`cat ip.txt` && rm -rf ip.txt
  else
    rm -rf temp.json
    # NOTE(review): the password is passed on the sshpass command line and is
    # therefore visible in the process list.
    # shellcheck disable=SC2154
    sshpass -p "${linux_password}" ssh -n -o StrictHostKeyChecking=no root@"${prometheus_ip}" "cd /opt/sitesupport/prometheus-standalone &>/dev/null || exit;sh prometheusctl.sh add_exporter -j node-${localIp}-exporter -h ${localIp} -p 9100"
    return 0
  fi
}

# Create the install directory, copy the agent JAR, the scripts and the agent
# database into it, and register the once-a-minute self-healing cron job.
#
# Reads PACKAGE_FULL_WAY, AGENT_INSTALL_HOME, JAR_NAME and SH_HOME. Reports
# through logError when the install directory already exists.
function installAgent() {
  # Create the install directory; an existing one is reported as an error.
  if [[ -e ${AGENT_INSTALL_HOME} ]]; then logError "安装目录[${AGENT_INSTALL_HOME}]已存在,请检查!"; fi
  mkdir "${AGENT_INSTALL_HOME}"
  checkSshpass
  createSshkey
  checkCrond
  cp -f "${PACKAGE_FULL_WAY}/${JAR_NAME}" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/${SH_HOME}" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/common.sh" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/constant.sh" "${AGENT_INSTALL_HOME}"
  cp "${PACKAGE_FULL_WAY}/agent.db" "${AGENT_INSTALL_HOME}"
  chmod 755 "${AGENT_INSTALL_HOME}/${SH_HOME}"
  # Register the cron job. The command is written without backticks: with
  # them, the shell started by cron would run the script and then try to
  # execute whatever the script printed as a second command. The script is a
  # bash script (see its shebang), so it is started with bash.
  echo "*/1 * * * * root cd ${AGENT_INSTALL_HOME} && bash ${SH_HOME} self_healing" >>/etc/crontab
  logInfo "config jar finish"
}

# Full installation: memory check first, then each remaining stage in order.
# The return status is that of the last stage, installNodeExporter.
function install() {
  local install_stage
  judgeMem 1024000
  for install_stage in checkDepend installAgent start installNodeExporter; do
    "${install_stage}"
  done
}

# Print the "agent started" banner with the agent URL built from ${localIp}.
function print() {
  local banner="====================== sitesupport-agent 启动完成 ======================

=                   private: http://${localIp}:8888                  =

========================================================================"
  echo -e "${banner}"
}

# Start the agent JAR and wait (polling every 5 seconds, for up to 60 seconds)
# until serviceIsAlive reports it healthy.
#
# serviceIsAlive status: 1 = no process, 2 = process exists but is not
# serving yet, anything else = healthy.
# On success the banner is printed and 0 is returned. On timeout any leftover
# agent process is killed and the failure is reported through logError.
# node_exporter is started in both cases when its unit is installed.
function start() {
  local step=5
  local res=1
  local bool=1
  local i # kept local so the loop counter does not leak into the caller
  for ((i = 0; i < 60; i += step)); do
    serviceIsAlive
    res=$?
    if [[ ${res} -eq 1 ]]; then
      # NOTE(review): this opens a JDWP debug port (18888) on every start;
      # self_healing launches the JAR without it. Confirm that is intended.
      nohup java -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=18888 -jar "${AGENT_INSTALL_HOME}/${JAR_NAME}" >"${AGENT_INSTALL_HOME}/nohup.out" 2>&1 &
      logInfo "${JAR_NAME}服务启动中..."
    elif [[ ${res} -eq 2 ]]; then
      logInfo "${JAR_NAME}服务启动中..."
      bool=1
    else
      logInfo "${JAR_NAME}服务已正常启动!"
      bool=0
      break
    fi
    sleep "${step}"
  done

  # Start node_exporter whether or not the agent came up. Previously this was
  # only reached on timeout, so "restart" left node_exporter dead after stop
  # had killed it. Skipped while the unit is not installed yet, which is the
  # case on the first start during install.
  if systemctl cat node_exporter.service &>/dev/null; then
    systemctl start node_exporter.service
  fi

  if [[ ${bool} -eq 0 ]]; then
    print
    return 0
  fi

  # Timed out: kill whatever agent processes are left. There may be none or
  # several, so the PID list is split into one argument per PID.
  local pids=""
  pids=$(pgrep -f -- "${JAR_NAME}")
  if [[ -n ${pids} ]]; then
    # shellcheck disable=SC2086
    kill -9 ${pids}
  fi
  logError "${JAR_NAME}服务启动失败!i will kill it!!"
}

# Stop the agent process(es) and the node_exporter listening on port 9100.
#
# The agent is killed whenever a process exists (serviceIsAlive status other
# than 1), including the "process exists but is not serving" state, which was
# previously reported as "not started" and left running.
function stop() {
  local res=0
  local pids=""
  serviceIsAlive
  res=$?
  if [[ ${res} -ne 1 ]]; then
    pids=$(pgrep -f -- "${JAR_NAME}")
    if [[ -n ${pids} ]]; then
      # One argument per PID; there may be more than one matching process.
      # shellcheck disable=SC2086
      kill -9 ${pids}
    fi
    logInfo "${JAR_NAME}停止成功"
  else
    logInfo "${JAR_NAME}没有启动"
  fi

  # First listening socket whose line mentions 9100 and node_exporter; the
  # PID is the part before the "/" in the PID/Program column.
  local node_pid=""
  node_pid=$(netstat -tnlp | awk '/9100/ && /node_exporter/ { split($7, parts, "/"); print parts[1]; exit }')
  if [[ -n ${node_pid} ]]; then kill -9 "${node_pid}"; fi
}

# Report the state of the agent process.
#
# Returns:
#   0 - a process exists, owns a listening TCP socket and answers HTTP on
#       http://${localIp}:8888
#   1 - no agent process exists
#   2 - a process exists but owns no listening socket (possibly still
#       starting, possibly failed), or it listens but does not answer HTTP
function serviceIsAlive() {
  setLocalIp
  local pids=""
  pids=$(pgrep -f -- "${JAR_NAME}")
  if [[ -z ${pids} ]]; then
    return 1
  fi

  # Compare each PID against the PID/Program column only. Grepping the whole
  # netstat output for the bare PID also matched ports and longer PIDs that
  # merely contain the same digits (for example PID 88 against port 8888).
  local sockets=""
  local pid=""
  local listening=1
  sockets=$(netstat -tlnp)
  # shellcheck disable=SC2086
  for pid in ${pids}; do
    if awk -v pid="${pid}" 'index($7, pid "/") == 1 { found = 1 } END { exit !found }' <<<"${sockets}"; then
      listening=0
      break
    fi
  done
  if [[ ${listening} -ne 0 ]]; then
    return 2
  fi

  # Bounded by a timeout so that a hung agent cannot block the caller forever.
  if ! curl --max-time 10 "http://${localIp}:8888" &>/dev/null; then
    return 2 # the port is open but the service does not answer
  fi
  return 0
}

# Service self-healing; meant to be run from a cron job.
#
# Polls serviceIsAlive every 5 seconds for 60 seconds and launches the agent
# JAR whenever no process exists (status 1). If the last observed state was
# not healthy when the window ends, any leftover agent process is killed.
function self_healing() {
  local step=5
  local res=1
  local bool=1
  local i # kept local so the loop counter does not leak into the caller
  for ((i = 0; i < 60; i += step)); do
    serviceIsAlive
    res=$?
    if [[ ${res} -eq 1 ]]; then
      logInfo "${JAR_NAME}服务开始启动!"
      nohup java -jar "${AGENT_INSTALL_HOME}/${JAR_NAME}" >"${AGENT_INSTALL_HOME}/nohup.out" 2>&1 &
    elif [[ ${res} -eq 2 ]]; then
      logInfo "${JAR_NAME}服务启动中..."
      bool=1
    else
      logInfo "${JAR_NAME}服务已正常启动!"
      bool=0
    fi
    sleep "${step}"
  done

  if [[ ${bool} -eq 1 ]]; then
    # There may be no process at all, or several; kill only what exists and
    # pass one argument per PID.
    local pids=""
    pids=$(pgrep -f -- "${JAR_NAME}")
    if [[ -n ${pids} ]]; then
      # shellcheck disable=SC2086
      kill -9 ${pids}
    fi
    logInfo "${JAR_NAME}服务启动失败!i will kill it!!"
  fi
}

# Stop the agent, remove its install directory, drop its cron entry and
# remove the node_exporter systemd unit.
function uninstall() {
  local unit_file=/usr/lib/systemd/system/node_exporter.service

  stop
  # ":?" aborts instead of running "rm -rf" on an empty path.
  rm -rf -- "${AGENT_INSTALL_HOME:?}"

  # Remove the cron job. /etc/crontab must not be sourced afterwards: that
  # would execute its lines as shell commands in this script. cron re-reads
  # the file on its own when it changes.
  sed -i '/agentctl.sh/d' /etc/crontab

  # Remove the exporter unit; disable it first so no dangling enable symlink
  # is left behind, then let systemd forget the unit.
  if [[ -e ${unit_file} ]]; then
    systemctl disable node_exporter.service
    rm -f -- "${unit_file}"
    systemctl daemon-reload
  fi
}

# Report the agent state; the return status is exactly that of serviceIsAlive.
function check_status() {
  serviceIsAlive
  return $?
}

# Entry point: run the sub-command named by the first argument. Any other
# value reports the usage line through logError.
function main() {
  case "$1" in
  start)
    start
    ;;
  stop)
    stop
    ;;
  restart)
    stop
    start
    ;;
  install)
    install
    ;;
  uninstall)
    uninstall
    ;;
  check_status)
    check_status
    ;;
  self_healing)
    self_healing
    ;;
  *)
    # The usage line now lists restart, which is handled above.
    logError "Usage: $0 {start|stop|restart|install|uninstall|check_status|self_healing} {..}"
    ;;
  esac
}

main "$@"
原文地址:https://www.cnblogs.com/dhName/p/15334438.html