script/shell/k8s/diagnose_k8s.sh

#!/usr/bin/env bash

# Global setup shared by every collector below.
# NOTE(review): xtrace echoes every expanded command to stderr, which includes
# values passed via --access-key-id / --access-key-secret - confirm this is acceptable.
set -x
# Directory the script was started from; the final archive is written here.
current_dir=$(pwd)
# Epoch seconds, used to name both the working directory and the archive.
timestamp=$(date +%s)
diagnose_dir=/tmp/diagnose_${timestamp}
# Every collector writes below this directory, so stop early if it cannot be created.
if ! mkdir -p "$diagnose_dir"; then
    echo "failed to create ${diagnose_dir}" >&2
    exit 1
fi
# Set to true by check_ps_hang when `ps -ef` does not finish in time.
is_ps_hang=false

# Run a command framed by banner lines and report when it fails.
# Arguments: $@ - the command and its arguments.
# Outputs:   a blank line, a "run ..." banner, the command's output, a
#            "failed to collect info" line on non-zero status, and an
#            "End of ..." banner - all on stdout.
# External programs are limited to 10 seconds via timeout(1). Shell functions
# and builtins (e.g. `dist`, `ulimit`) cannot be exec'd by timeout, so they are
# invoked directly in the current shell instead.
run() {
    local status=0

    echo
    echo "-----------------run $*------------------"
    case "$(type -t "$1")" in
        function | builtin)
            "$@" || status=$?
            ;;
        *)
            # "$@" keeps each argument intact (paths with spaces, quoted patterns).
            timeout 10s "$@" || status=$?
            ;;
    esac
    if [ "$status" != "0" ]; then
        echo "failed to collect info: $*"
    fi
    echo "------------End of ${1}----------------"
}

# Detect the distribution by searching /etc/os-release for known markers and
# export the result as OS. Markers are tried in order and the first hit wins.
# When none matches, a message is printed and the whole script exits with 1.
os_env() {
    local entry marker

    # Each entry is "<text searched in /etc/os-release>=<value exported as OS>".
    for entry in \
        "Ubuntu=Ubuntu" \
        "SUSE=SUSE" \
        "Red Hat=RedHat" \
        "CentOS Linux=CentOS" \
        "Kylin Linux=CentOS" \
        "Aliyun Linux=AliyunOS" \
        "Alibaba Group Enterprise Linux=AliOS"
    do
        marker=${entry%%=*}
        if grep -q "$marker" /etc/os-release; then
            export OS="${entry#*=}"
            return
        fi
    done

    echo "unknown os...  exit."
    exit 1
}

# Print the contents of every file whose path matches /etc/issue*
# (presumably the distribution's login banner files such as /etc/issue and /etc/issue.net).
dist() {
    cat /etc/issue*
}

# Succeed when `command -v` can resolve the given name(s); all output is discarded.
# Arguments: $@ - command name(s) to look up.
command_exists() {
    command -v "$@" &> /dev/null
}

# Service status
# Record the status of the firewalld, ntpd and chronyd services, echoing each
# report to stdout and appending it to ${diagnose_dir}/service_status.
service_status() {
    local svc

    for svc in firewalld ntpd chronyd; do
        # tee -a appends; plain tee would truncate the file on every call and
        # leave only the last service's report in it.
        run service "$svc" status | tee -a "${diagnose_dir}/service_status"
    done
}


#system info

# Append kernel, distribution, resource-limit and sysctl information to
# ${diagnose_dir}/system_info, echoing everything to stdout as well.
# Every command goes through run, which adds the banners.
system_info() {
    local out="${diagnose_dir}/system_info"

    run uname -a | tee -a "$out"
    run uname -r | tee -a "$out"
    run dist | tee -a "$out"
    if command_exists lsb_release; then
        # Without options lsb_release only reports the LSB module versions;
        # -a adds distributor ID, description, release and codename.
        run lsb_release -a | tee -a "$out"
    fi
    run ulimit -a | tee -a "$out"
    run sysctl -a | tee -a "$out"
}

#network
# Append interface, link, routing, iptables and socket information to
# ${diagnose_dir}/network_info, echoing everything to stdout as well.
network_info() {
    local out="${diagnose_dir}/network_info"

    run ip --details ad show | tee -a "$out"
    run ip --details link show | tee -a "$out"
    run ip route show | tee -a "$out"
    run iptables-save | tee -a "$out"
    # netstat goes through run like every other collector, so each dump gets
    # a banner and a failure (e.g. netstat not installed) is recorded in the file.
    run netstat -nt | tee -a "$out"
    run netstat -nu | tee -a "$out"
    run netstat -ln | tee -a "$out"
}


# check ps -ef command is hung
# Probe whether `ps -ef` finishes within 2 seconds. If it does not, set the
# global is_ps_hang to true and look for the tasks responsible: every task
# whose /proc status reports state D and whose cmdline cannot be read within
# 2 seconds is reported together with its kernel stack.
# Globals: diagnose_dir (read), is_ps_hang (written).
# Outputs: progress messages on stdout, appended to ${diagnose_dir}/ps_command_status.
check_ps_hang() {
  local report="${diagnose_dir}/ps_command_status"
  local status_file task_dir pid tid

  echo "check if ps -ef command hang" | tee -a "$report"
  # Only the exit status matters here; the listing itself is collected by system_status.
  if timeout -s 9 2 ps -ef > /dev/null 2>&1; then
      echo "ps -ef command works fine" | tee -a "$report"
      return 0
  fi

  echo "ps -ef command is hung" | tee -a "$report"
  is_ps_hang=true
  echo "start to check which process lead to ps -ef command hang" | tee -a "$report"
  # Numeric globs skip /proc/self and /proc/thread-self, which would only
  # repeat tasks already visited.
  for status_file in /proc/[0-9]*/task/[0-9]*/status; do
      # Tasks can exit between glob expansion and the read.
      [[ -r "$status_file" ]] || continue
      grep -Eq '^State:[[:space:]]+D' "$status_file" 2> /dev/null || continue

      task_dir=${status_file%status}
      # cmdline is NUL-separated and only its readability matters, so discard it.
      if ! timeout -s 9 2 cat "${task_dir}cmdline" > /dev/null 2>&1; then
          # status_file has the form /proc/<pid>/task/<tid>/status.
          pid=${status_file#/proc/}
          pid=${pid%%/*}
          tid=${task_dir%/}
          tid=${tid##*/}
          echo "process $pid (task $tid) is in State D and lead to ps -ef process hang,stack info:" | tee -a "$report"
          cat "${task_dir}stack" | tee -a "$report"
      fi
  done
  echo "finish to check which process lead to ps -ef command hang" | tee -a "$report"
}


#system status
# Append a snapshot of the running system to ${diagnose_dir}/system_status,
# echoing everything to stdout through tee: uptime, top, the process list,
# socket tables, disk usage, mounts, the process tree, open files and the
# processes holding the most file descriptors.
# Globals: diagnose_dir (read), is_ps_hang (read).
system_status() {
    #mkdir -p ${diagnose_dir}/system_status
    run uptime | tee -a ${diagnose_dir}/system_status
    run top -b -n 1 | tee -a ${diagnose_dir}/system_status
    # ps is skipped when check_ps_hang found it hanging; only a note is recorded then.
    if [ "$is_ps_hang" == "false" ]; then
        run ps -ef | tee -a ${diagnose_dir}/system_status
    else
        echo "ps -ef command hang, skip [ps -ef] check" | tee -a ${diagnose_dir}/system_status
    fi
    run netstat -nt | tee -a ${diagnose_dir}/system_status
    run netstat -nu | tee -a ${diagnose_dir}/system_status
    run netstat -ln | tee -a ${diagnose_dir}/system_status

    run df -h | tee -a ${diagnose_dir}/system_status

    run cat /proc/mounts | tee -a ${diagnose_dir}/system_status

    # pstree is guarded by the same flag as ps above.
    if [ "$is_ps_hang" == "false" ]; then
        run pstree -al | tee -a ${diagnose_dir}/system_status
    else
        echo "ps -ef command hang, skip [pstree -al] check" | tee -a ${diagnose_dir}/system_status
    fi

    run lsof | tee -a ${diagnose_dir}/system_status

    # Open-file-descriptor ranking. For every numeric directory directly under
    # /proc, find emits one line of the form
    #   "<fd count> fds (PID = <pid>), command: <cmdline with NULs turned into spaces>"
    # The lines are sorted numerically in descending order and cut by head
    # (first 10 lines by default). The subshell keeps the `cd /proc` from
    # changing the caller's working directory.
    (
        cd /proc
        find -maxdepth 1 -type d -name '[0-9]*' \
         -exec bash -c "ls {}/fd/ | wc -l | tr '\n' ' '" \; \
         -printf "fds (PID = %P), command: " \
         -exec bash -c "tr '\0' ' ' < {}/cmdline" \; \
         -exec echo \; | sort -rn | head | tee -a ${diagnose_dir}/system_status
    )
}


# Record `systemctl status <unit> -l` for the container-runtime and kubelet
# units. Each unit's report is echoed to stdout and appended to
# ${diagnose_dir}/<unit>_status.
daemon_status() {
    local unit

    for unit in docker containerd container-storaged kubelet; do
        run systemctl status "$unit" -l | tee -a "${diagnose_dir}/${unit}_status"
    done
}

# Collect Docker daemon diagnostics: the dockerd process line, `docker info`,
# `docker version` (appended to ${diagnose_dir}/docker_status), the containerd
# events log and any *.log files under /var/run/docker.
# Globals: diagnose_dir (read), is_ps_hang (read).
docker_status() {
    local out="${diagnose_dir}/docker_status"
    local pid_file=/var/run/docker.pid
    local events_log=/var/run/docker/libcontainerd/containerd/events.log
    local signalled=false
    local log_file

    echo "check dockerd process"
    if [ "$is_ps_hang" == "false" ]; then
        run ps -ef | grep -E 'dockerd|docker daemon' | grep -v grep | tee -a "$out"
    else
        echo "ps -ef command hang, skip [ps -ef|grep -E 'dockerd|docker daemon'] check" | tee -a "$out"
    fi

    #docker info
    run docker info | tee -a "$out"
    run docker version | tee -a "$out"

    # SIGUSR1 asks dockerd to write a stack dump (see the Docker daemon
    # troubleshooting docs). Only signal when a pid file exists; otherwise
    # kill would be called without a target.
    if [[ -s "$pid_file" ]]; then
        if sudo kill -SIGUSR1 "$(cat "$pid_file")"; then
            signalled=true
        fi
    else
        echo "$pid_file not found, skip docker stack dump" | tee -a "$out"
    fi

    if [[ -f "$events_log" ]]; then
        cp "$events_log" "${diagnose_dir}/containerd_events.log"
    fi

    # Give dockerd time to finish writing the dump; pointless when no signal was sent.
    if [[ "$signalled" == "true" ]]; then
        sleep 10
    fi
    for log_file in /var/run/docker/*.log; do
        # An unmatched glob stays literal; skip it instead of letting cp fail.
        [[ -f "$log_file" ]] || continue
        cp "$log_file" "${diagnose_dir}"
    done
}

# Print the last 200 lines of a file; print nothing when it is not a regular file.
# Arguments: $1 - path of the log file.
showlog() {
    local file=$1
    if [ -f "$file" ]; then
        # Quoted so a path containing spaces is passed to tail as one argument;
        # "--" protects paths that start with a dash.
        tail -n 200 -- "$file"
    fi
}

#collect log
# Collect kernel, syslog and Docker service logs into ${diagnose_dir}/logs:
#   dmesg.log  - last 10000 lines of `dmesg -T` (also echoed to stdout)
#   messages   - last 500M of /var/log/messages
#   docker.log - last 10000 lines of the docker.service journal when systemd
#                is running, otherwise of /var/log/upstart/docker.log
# Globals: diagnose_dir (read), log_tail_lines (written).
common_logs() {
    local log_dir="${diagnose_dir}/logs"

    log_tail_lines=10000
    mkdir -p "$log_dir"
    run dmesg -T | tail -n "${log_tail_lines}" | tee "${log_dir}/dmesg.log"
    tail -c 500M /var/log/messages &> "${log_dir}/messages"
    # An explicit if/else: with `a && b || c`, a failing journalctl would also
    # run the upstart fallback and overwrite journalctl's output in docker.log.
    if pidof systemd > /dev/null; then
        journalctl -n "${log_tail_lines}" -u docker.service &> "${log_dir}/docker.log"
    else
        tail -n "${log_tail_lines}" /var/log/upstart/docker.log &> "${log_dir}/docker.log"
    fi
}

# Pack the collected diagnostics into a gzip-compressed tarball named
# diagnose_<timestamp>.tar.gz in the directory the script was started from,
# then tell the operator which file to pick up.
# Globals: current_dir, timestamp, diagnose_dir (all read).
archive() {
    local tarball="diagnose_${timestamp}.tar.gz"

    tar -zcvf "${current_dir}/${tarball}" "${diagnose_dir}"
    echo "please get ${tarball} for diagnostics"
}

# Save every line of /var/log/messages that mentions cloud-init to
# ${diagnose_dir}/varlogmessage.log.
varlogmessage() {
    local out_file="${diagnose_dir}/varlogmessage.log"

    grep cloud-init /var/log/messages > "$out_file"
}

# Write the stdout of `kubectl cluster-info dump` to ${diagnose_dir}/cluster_dump.log.
cluster_dump() {
    local out_file="${diagnose_dir}/cluster_dump.log"

    kubectl cluster-info dump > "$out_file"
}

# Write the stdout of `kubectl get events` to ${diagnose_dir}/events.log.
events() {
    local out_file="${diagnose_dir}/events.log"

    kubectl get events > "$out_file"
}

# Save the logs of every kube-system pod selected by "<label>=<component>".
# Arguments: $1 - component name (label value, also the output sub-directory)
#            $2 - label key used in the selector
# Outputs:   one file per pod at ${diagnose_dir}/cs/<component>/<pod>.log
#            holding both stdout and stderr of `kubectl logs`.
core_component() {
    local comp="$1"
    local label="$2"
    local log_dir="${diagnose_dir}/cs/${comp}"
    local pods po

    mkdir -p "${log_dir}/"
    # First column of the pod listing, minus the NAME header row.
    pods=$(kubectl get -n kube-system po -l "${label}=${comp}" \
        | awk '$1 !~ /NAME/ { print $1 }')
    for po in ${pods}; do
        kubectl logs -n kube-system "${po}" &> "${log_dir}/${po}.log"
    done
}

# Save the etcd unit's journal (stdout and stderr of journalctl) to
# ${diagnose_dir}/cs/etcd.log.
etcd() {
    local out_dir="${diagnose_dir}/cs"

    # Create the directory here instead of relying on core_component having
    # run first; otherwise the redirection fails and nothing is collected.
    mkdir -p "$out_dir"
    journalctl -u etcd -xe &> "${out_dir}/etcd.log"
}

# Copy the files directly under /var/log/alicloud into ${diagnose_dir}/storage/.
storageplugins() {
    local dest="${diagnose_dir}/storage/"

    mkdir -p "$dest"
    cp /var/log/alicloud/* "$dest"
}

# Download the sandbox-runtime-status bundle and run its script_collect.sh,
# appending the script's stdout to ${diagnose_dir}/sandbox_runtime.status.
# Does nothing (returns 0) when dockerd is running or containerd is not.
# Returns: 0 when skipped, 1 when download or extraction fails, otherwise
#          the exit status of script_collect.sh.
sandbox_runtime_status() {
    if [[ -n $(pidof dockerd) || -z $(pidof containerd) ]]; then
        return 0
    fi

    local url=http://aliacs-k8s-cn-hangzhou.oss-cn-hangzhou.aliyuncs.com/public/diagnose/sandbox-runtime-status.tgz
    local tarball="${diagnose_dir}/sandbox-runtime-status.tgz"
    local script_dir="${diagnose_dir}/sandbox-runtime-status"

    # NOTE(review): the bundle is fetched over plain HTTP and then executed;
    # consider HTTPS plus a checksum before trusting its contents.
    if ! wget "$url" -q -O "$tarball"; then
        echo "failed to download sandbox-runtime-status.tgz, skip sandbox runtime check"
        return 1
    fi
    if ! tar -xzvf "$tarball" -C "${diagnose_dir}"; then
        echo "failed to extract sandbox-runtime-status.tgz, skip sandbox runtime check"
        return 1
    fi
    # Never fall through to a script_collect.sh that merely happens to sit in
    # the caller's working directory.
    if [[ ! -f "${script_dir}/script_collect.sh" ]]; then
        echo "script_collect.sh not found in ${script_dir}, skip sandbox runtime check"
        return 1
    fi

    # Subshell: the directory change cannot leak into the rest of the script.
    (
        cd "$script_dir" || exit 1
        bash script_collect.sh >> "${diagnose_dir}/sandbox_runtime.status"
    )
}

# Upload ./diagnose_<timestamp>.tar.gz to the OSS location given in UPLOAD_OSS
# ("<bucket>[/<path>]"); optionally print a signed link afterwards.
# Globals (read): UPLOAD_OSS        - destination; the function is a no-op when empty
#                 ACCESS_KEY_ID     - when empty, the instance RAM role is used instead
#                 ACCESS_KEY_SECRET - secret paired with ACCESS_KEY_ID
#                 OSS_PUBLIC_LINK   - when non-empty, a 7200-second signed URL is printed
#                 timestamp         - selects the archive file name
# Returns: 0 when skipped; 1 when ossutil cannot be fetched or the temporary
#          config cannot be created; otherwise the status of `ossutil cp`.
upload_oss() {
  if [[ "$UPLOAD_OSS" == "" ]]; then
      return 0
  fi

  local bucket_path=${UPLOAD_OSS}
  local diagnose_file=diagnose_${timestamp}.tar.gz
  local region endpoint role_name bucket_name oss_endpoint config_file
  local upload_status=0

  if ! command_exists ossutil; then
    # NOTE(review): the binary is fetched over plain HTTP without a checksum.
    # -f makes curl fail on an HTTP error instead of saving the error page as the binary.
    if ! curl -f -o /usr/local/bin/ossutil http://gosspublic.alicdn.com/ossutil/1.6.10/ossutil64; then
      echo "failed to download ossutil, skip upload"
      return 1
    fi
    chmod u+x /usr/local/bin/ossutil
  fi

  # The config holds credentials: keep it in a private temp file (mktemp
  # creates it readable by the owner only) rather than ./config, which would
  # overwrite an unrelated file in the working directory and stay behind.
  if ! config_file=$(mktemp); then
    echo "failed to create temporary ossutil config, skip upload"
    return 1
  fi

  region=$(curl http://100.100.100.200/latest/meta-data/region-id)
  endpoint="oss-$region.aliyuncs.com"
  # Here-documents are used because xtrace prints only `cat`, not the
  # expanded body, so the secret does not end up in the trace output.
  if [[ "$ACCESS_KEY_ID" == "" ]]; then
    role_name=$(curl 100.100.100.200/latest/meta-data/ram/security-credentials/)
    cat > "$config_file" <<EOF

[Credentials]
        language = CH
        endpoint = $endpoint
[AkService]
        ecsAk=http://100.100.100.200/latest/meta-data/Ram/security-credentials/$role_name
EOF
  else
    cat > "$config_file" <<EOF

[Credentials]
        language = CH
        endpoint = $endpoint
        accessKeyID = $ACCESS_KEY_ID
        accessKeySecret = $ACCESS_KEY_SECRET

EOF
  fi

  bucket_name=${bucket_path%%/*}
  # Prefer the endpoint the bucket itself reports over the region-derived default.
  oss_endpoint=$(ossutil stat "oss://$bucket_name" --config-file "$config_file" \
    | awk '/ExtranetEndpoint/ {print $3}')
  if [[ "$oss_endpoint" != "" ]]; then
    endpoint=$oss_endpoint
  fi
  ossutil cp "./${diagnose_file}" "oss://$bucket_path/$diagnose_file" \
    --config-file "$config_file" --endpoint "$endpoint" || upload_status=$?

  if [[ "$OSS_PUBLIC_LINK" != "" ]]; then
    ossutil sign --timeout 7200 "oss://$bucket_path/$diagnose_file" \
      --config-file "$config_file" --endpoint "$endpoint"
  fi

  rm -f -- "$config_file"
  return "$upload_status"
}

# Translate command-line options into exported environment variables:
#   --oss <bucket/path>          -> UPLOAD_OSS
#   --oss-public-link            -> OSS_PUBLIC_LINK="true"
#   --access-key-id <id>         -> ACCESS_KEY_ID
#   --access-key-secret <secret> -> ACCESS_KEY_SECRET
# Anything else is reported as an unknown option and skipped.
parse_args() {
    local opt

    while (( $# > 0 )); do
        # Consume the option itself first; a value, if any, is then in $1.
        opt=$1
        shift

        case "$opt" in
            --oss)
                export UPLOAD_OSS=$1
                shift
                ;;
            --oss-public-link)
                export OSS_PUBLIC_LINK="true"
                ;;
            --access-key-id)
                export ACCESS_KEY_ID=$1
                shift
                ;;
            --access-key-secret)
                export ACCESS_KEY_SECRET=$1
                shift
                ;;
            *)
                echo "unknown option [$opt]"
                ;;
        esac
    done
}

# Run every collector in a fixed order and finish by building the archive.
# check_ps_hang runs before system_status and docker_status because both read
# the is_ps_hang flag it sets; archive runs last so it packs everything.
pd_collect() {
    local step spec

    # Node-level information.
    for step in \
        os_env system_info service_status network_info check_ps_hang \
        system_status docker_status sandbox_runtime_status common_logs \
        varlogmessage
    do
        "$step"
    done

    # Control-plane pod logs; each entry is "<component> <label key>".
    for spec in \
        "cloud-controller-manager app" \
        "kube-apiserver component" \
        "kube-controller-manager component" \
        "kube-scheduler component"
    do
        core_component "${spec% *}" "${spec#* }"
    done

    # Cluster-level information, then the tarball.
    for step in events storageplugins etcd cluster_dump archive; do
        "$step"
    done
}

# Entry point: read the command-line options, collect all diagnostics into the
# archive, then upload it (upload_oss returns immediately when UPLOAD_OSS is empty).
parse_args "$@"

pd_collect

upload_oss

# Final hint for the operator; the message is Chinese for
# "please upload diagnose_<timestamp>.tar.gz".
echo "请上传 diagnose_${timestamp}.tar.gz"