mirror of https://github.com/lework/script
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
339 lines
10 KiB
339 lines
10 KiB
2 years ago
|
#!/usr/bin/env bash
# Node diagnostics collector: every collector function below writes its
# output into ${diagnose_dir}, which is finally archived as
# ${current_dir}/diagnose_${timestamp}.tar.gz.

# Trace every command so the collection run itself can be debugged.
# NOTE(review): tracing also prints the values passed to --access-key-id and
# --access-key-secret on stderr -- confirm this is acceptable.
set -x

# Directory the script was started from; the archive is written here.
current_dir=$(pwd)
# Epoch seconds, used to make the output directory and archive name unique.
timestamp=$(date +%s)
diagnose_dir=/tmp/diagnose_${timestamp}

# Every collector tees into this directory, so there is no point continuing
# if it cannot be created.
if ! mkdir -p "$diagnose_dir"; then
  echo "failed to create ${diagnose_dir}" >&2
  exit 1
fi

# Set to "true" by check_ps_hang when `ps -ef` does not return in time.
is_ps_hang=false
|
||
|
|
||
|
#######################################
# Run a command between banner lines and report when it fails.
# External programs are wrapped in `timeout 10s`. Shell functions and
# builtins (e.g. `dist`, `ulimit`) are invoked directly, because timeout can
# only exec programs and would otherwise always fail for them; they therefore
# run without a time limit.
# Arguments: $1 - command name, remaining args - its arguments
# Outputs:   banner, command output, optional failure line, end banner
#######################################
run() {
  local rc=0

  echo
  echo "-----------------run $*------------------"

  case "$(type -t "$1" 2> /dev/null)" in
    function | builtin)
      "$@" || rc=$?
      ;;
    *)
      # Quoted so arguments containing spaces are passed through intact.
      timeout 10s "$@" || rc=$?
      ;;
  esac

  if [ "$rc" != "0" ]; then
    echo "failed to collect info: $*"
  fi
  echo "------------End of ${1}----------------"
}
|
||
|
|
||
|
#######################################
# Detect the distribution from /etc/os-release and export it as OS.
# The first matching pattern wins; Kylin Linux is reported as "CentOS".
# Exits the whole script with status 1 when no pattern matches.
#######################################
os_env() {
  local release_file=/etc/os-release
  local entry

  # Each entry is "<grep pattern>=<value exported as OS>".
  for entry in \
    "Ubuntu=Ubuntu" \
    "SUSE=SUSE" \
    "Red Hat=RedHat" \
    "CentOS Linux=CentOS" \
    "Kylin Linux=CentOS" \
    "Aliyun Linux=AliyunOS" \
    "Alibaba Group Enterprise Linux=AliOS"; do
    if grep -q "${entry%%=*}" "$release_file"; then
      export OS="${entry#*=}"
      return
    fi
  done

  echo "unknown os... exit."
  exit 1
}
|
||
|
|
||
|
# Print the contents of every file matching /etc/issue*.
dist() {
  cat -- /etc/issue*
}
|
||
|
|
||
|
# Succeed when `command -v` can resolve the given name(s); prints nothing.
command_exists() {
  command -v "$@" &> /dev/null
}
|
||
|
|
||
|
# Service status
|
||
|
#######################################
# Record the status of the firewalld, ntpd and chronyd services in
# ${diagnose_dir}/service_status.
# Globals: diagnose_dir (read)
#######################################
service_status() {
  local report="${diagnose_dir}/service_status"
  local svc

  for svc in firewalld ntpd chronyd; do
    # Append: a plain `tee` would truncate the report on every iteration and
    # keep only the last service.
    run service "$svc" status | tee -a "$report"
  done
}
|
||
|
|
||
|
|
||
|
#system info
|
||
|
|
||
|
#######################################
# Append kernel, distribution, ulimit and sysctl information to
# ${diagnose_dir}/system_info. lsb_release is only queried when installed.
# Globals: diagnose_dir (read)
#######################################
system_info() {
  local report="${diagnose_dir}/system_info"

  {
    run uname -a
    run uname -r
    run dist
    if command_exists lsb_release; then
      run lsb_release
    fi
    run ulimit -a
    run sysctl -a
  } | tee -a "$report"
}
|
||
|
|
||
|
#network
|
||
|
#######################################
# Append addresses, links, routes, iptables rules and netstat listings to
# ${diagnose_dir}/network_info. The three netstat calls are made directly,
# i.e. without run's banner or timeout.
# Globals: diagnose_dir (read)
#######################################
network_info() {
  local report="${diagnose_dir}/network_info"

  {
    run ip --details ad show
    run ip --details link show
    run ip route show
    run iptables-save
    netstat -nt
    netstat -nu
    netstat -ln
  } | tee -a "$report"
}
|
||
|
|
||
|
|
||
|
# check ps -ef command is hung
|
||
|
#######################################
# Detect whether `ps -ef` hangs and, if so, look for the tasks responsible.
# `ps -ef` is given 2 seconds before being SIGKILLed; any non-zero status is
# treated as a hang. On a hang, every task status file under /proc whose
# State line matches "State.*D" is inspected: if reading its cmdline also
# fails or times out, the task's kernel stack is logged.
# Globals: diagnose_dir (read), is_ps_hang (set to "true" on a hang)
# Outputs: progress messages, also appended to
#          ${diagnose_dir}/ps_command_status
#######################################
check_ps_hang() {
  local report="${diagnose_dir}/ps_command_status"
  local status_path task_dir

  echo "check if ps -ef command hang" | tee -a "$report"

  if timeout -s 9 2 ps -ef > /dev/null; then
    echo "ps -ef command works fine" | tee -a "$report"
    return
  fi

  echo "ps -ef command is hung" | tee -a "$report"
  is_ps_hang=true
  echo "start to check which process lead to ps -ef command hang" | tee -a "$report"

  # Read one path per line instead of word-splitting the find output.
  while IFS= read -r status_path; do
    # Tasks may exit between find and grep; ignore the resulting errors.
    grep -q "State.*D" "$status_path" 2> /dev/null || continue

    # Directory of the task (with trailing slash), e.g. /proc/1/task/1/
    task_dir=${status_path%status}
    if ! timeout -s 9 2 cat "${task_dir}cmdline"; then
      echo "process $task_dir is in State D and lead to ps -ef process hang,stack info:" | tee -a "$report"
      cat "${task_dir}stack" | tee -a "$report"
    fi
  done < <(find /proc/*/task -name status 2> /dev/null)

  echo "finish to check which process lead to ps -ef command hang" | tee -a "$report"
}
|
||
|
|
||
|
|
||
|
#system status
|
||
|
#######################################
# Append a snapshot of the running system (uptime, top, processes, sockets,
# disk usage, mounts, process tree, open files and the ten processes with
# the most open file descriptors) to ${diagnose_dir}/system_status.
# `ps -ef` and `pstree -al` are skipped when check_ps_hang flagged a hang.
# Globals: diagnose_dir (read), is_ps_hang (read)
#######################################
system_status() {
  local report="${diagnose_dir}/system_status"

  run uptime | tee -a "$report"
  run top -b -n 1 | tee -a "$report"
  if [ "$is_ps_hang" == "false" ]; then
    run ps -ef | tee -a "$report"
  else
    echo "ps -ef command hang, skip [ps -ef] check" | tee -a "$report"
  fi
  run netstat -nt | tee -a "$report"
  run netstat -nu | tee -a "$report"
  run netstat -ln | tee -a "$report"

  run df -h | tee -a "$report"

  run cat /proc/mounts | tee -a "$report"

  if [ "$is_ps_hang" == "false" ]; then
    run pstree -al | tee -a "$report"
  else
    echo "ps -ef command hang, skip [pstree -al] check" | tee -a "$report"
  fi

  run lsof | tee -a "$report"

  # Per-process fd count, sorted descending, top ten. Runs in a subshell so
  # the cd does not leak into the rest of the script.
  (
    # Without this guard a failed cd would make find scan the current
    # directory instead of /proc.
    cd /proc || exit 1
    # The directory is passed to bash as $1 rather than spliced into the
    # command string.
    find . -maxdepth 1 -type d -name '[0-9]*' \
      -exec bash -c 'ls "$1"/fd/ | wc -l | tr "\n" " "' _ {} \; \
      -printf "fds (PID = %P), command: " \
      -exec bash -c 'tr "\0" " " < "$1"/cmdline' _ {} \; \
      -exec echo \; | sort -rn | head | tee -a "$report"
  )
}
|
||
|
|
||
|
|
||
|
#######################################
# Append `systemctl status <unit> -l` for each container-related unit to
# ${diagnose_dir}/<unit>_status.
# Globals: diagnose_dir (read)
#######################################
daemon_status() {
  local unit

  for unit in docker containerd container-storaged kubelet; do
    run systemctl status "$unit" -l | tee -a "${diagnose_dir}/${unit}_status"
  done
}
|
||
|
|
||
|
#######################################
# Collect docker daemon information into ${diagnose_dir}/docker_status and
# copy docker's runtime log files into ${diagnose_dir}.
# Globals: diagnose_dir (read), is_ps_hang (read)
#######################################
docker_status() {
  local report="${diagnose_dir}/docker_status"
  local pid_file=/var/run/docker.pid

  echo "check dockerd process"
  if [ "$is_ps_hang" == "false" ]; then
    run ps -ef | grep -E 'dockerd|docker daemon' | grep -v grep | tee -a "$report"
  else
    echo "ps -ef command hang, skip [ps -ef|grep -E 'dockerd|docker daemon'] check" | tee -a "$report"
  fi

  run docker info | tee -a "$report"
  run docker version | tee -a "$report"

  cp /var/run/docker/libcontainerd/containerd/events.log "${diagnose_dir}/containerd_events.log"

  # Only signal when a PID is actually known; with a missing pid file the
  # kill would run without a target, and the wait below would be pointless.
  if [ -s "$pid_file" ]; then
    # NOTE(review): presumably SIGUSR1 makes dockerd write a stack dump that
    # is picked up by the *.log copy below -- confirm against Docker docs.
    sudo kill -SIGUSR1 "$(cat "$pid_file")"
    # Give the daemon time to finish writing before copying.
    sleep 10
  else
    echo "${pid_file} not found, skip sending SIGUSR1 to dockerd" | tee -a "$report"
  fi

  cp /var/run/docker/*.log "${diagnose_dir}"
}
|
||
|
|
||
|
#######################################
# Print the last 200 lines of a file; silently does nothing when the path is
# not a regular file.
# Arguments: $1 - path of the log file
#######################################
showlog() {
  local file=$1

  if [ -f "$file" ]; then
    # Quoted (and protected by --) so paths with spaces or a leading dash
    # reach tail as a single operand.
    tail -n 200 -- "$file"
  fi
}
|
||
|
|
||
|
#collect log
|
||
|
#######################################
# Collect kernel, syslog and docker logs into ${diagnose_dir}/logs.
#   dmesg.log  - last 10000 lines of `dmesg -T`
#   messages   - last 500M of /var/log/messages
#   docker.log - docker.service journal when systemd is running, otherwise
#                the upstart docker log
# Globals: diagnose_dir (read)
#######################################
common_logs() {
  local log_tail_lines=10000
  local log_dir="${diagnose_dir}/logs"

  mkdir -p "$log_dir"
  run dmesg -T | tail -n "$log_tail_lines" | tee "${log_dir}/dmesg.log"
  tail -c 500M /var/log/messages &> "${log_dir}/messages"

  # An explicit if/else: with `a && b || c` a failing journalctl would fall
  # through to the upstart branch and overwrite docker.log.
  if pidof systemd; then
    journalctl -n "$log_tail_lines" -u docker.service &> "${log_dir}/docker.log"
  else
    tail -n "$log_tail_lines" /var/log/upstart/docker.log &> "${log_dir}/docker.log"
  fi
}
|
||
|
|
||
|
#######################################
# Pack ${diagnose_dir} into ${current_dir}/diagnose_${timestamp}.tar.gz and
# tell the user which file to pick up.
# Globals: current_dir, diagnose_dir, timestamp (all read)
#######################################
archive() {
  # current_dir comes from `pwd` and may contain spaces, so both paths are
  # quoted.
  tar -zcvf "${current_dir}/diagnose_${timestamp}.tar.gz" "${diagnose_dir}"
  echo "please get diagnose_${timestamp}.tar.gz for diagnostics"
}
|
||
|
|
||
|
# Save every /var/log/messages line mentioning cloud-init to
# ${diagnose_dir}/varlogmessage.log.
varlogmessage() {
  local target="${diagnose_dir}/varlogmessage.log"
  grep cloud-init /var/log/messages > "$target"
}
|
||
|
|
||
|
# Write the output of `kubectl cluster-info dump` to
# ${diagnose_dir}/cluster_dump.log.
cluster_dump() {
  local target="${diagnose_dir}/cluster_dump.log"
  kubectl cluster-info dump > "$target"
}
|
||
|
|
||
|
# Write the output of `kubectl get events` to ${diagnose_dir}/events.log.
events() {
  local target="${diagnose_dir}/events.log"
  kubectl get events > "$target"
}
|
||
|
|
||
|
#######################################
# Save the logs of every kube-system pod selected by "<label>=<comp>" to
# ${diagnose_dir}/cs/<comp>/<pod>.log.
# Globals:   diagnose_dir (read)
# Arguments: $1 - component name (label value, also the output sub-directory)
#            $2 - label key used in the selector
#######################################
core_component() {
  local comp="$1"
  local label="$2"
  local out_dir="${diagnose_dir}/cs/${comp}"
  local pods po

  mkdir -p "${out_dir}/"

  # First column of the listing, dropping the header row.
  pods=$(kubectl get -n kube-system po -l "${label}=${comp}" | awk '$1 !~ /NAME/ {print $1}')

  for po in ${pods}; do
    kubectl logs -n kube-system "${po}" &> "${out_dir}/${po}.log"
  done
}
|
||
|
|
||
|
#######################################
# Save the etcd unit's journal (`journalctl -u etcd -xe`) to
# ${diagnose_dir}/cs/etcd.log.
# Globals: diagnose_dir (read)
#######################################
etcd() {
  # Create the target directory here rather than relying on an earlier
  # collector having created it.
  mkdir -p "${diagnose_dir}/cs"
  journalctl -u etcd -xe &> "${diagnose_dir}/cs/etcd.log"
}
|
||
|
|
||
|
# Copy the files directly under /var/log/alicloud into
# ${diagnose_dir}/storage/.
storageplugins() {
  local target="${diagnose_dir}/storage/"
  mkdir -p "$target"
  cp /var/log/alicloud/* "$target"
}
|
||
|
|
||
|
#######################################
# On nodes where containerd runs without dockerd, download the
# sandbox-runtime-status bundle, unpack it into ${diagnose_dir} and append
# the output of its script_collect.sh to ${diagnose_dir}/sandbox_runtime.status.
# Globals: diagnose_dir (read)
# Returns: 0 when skipped or collected, 1 when download/unpack fails,
#          otherwise the status of the collection script.
#######################################
sandbox_runtime_status() {
  local url=http://aliacs-k8s-cn-hangzhou.oss-cn-hangzhou.aliyuncs.com/public/diagnose/sandbox-runtime-status.tgz
  local tarball="${diagnose_dir}/sandbox-runtime-status.tgz"

  # Nothing to do when dockerd is running or containerd is not.
  if [[ -n $(pidof dockerd) || -z $(pidof containerd) ]]; then
    return 0
  fi

  # NOTE(review): the bundle is fetched over plain HTTP and then executed;
  # consider HTTPS and/or a checksum.
  if ! wget "$url" -q -O "$tarball"; then
    echo "failed to download sandbox-runtime-status.tgz"
    return 1
  fi
  if ! tar -xzvf "$tarball" -C "${diagnose_dir}"; then
    echo "failed to unpack sandbox-runtime-status.tgz"
    return 1
  fi

  # Subshell keeps the directory change local; the guard prevents running
  # script_collect.sh from an unrelated directory if the bundle layout is
  # not what is expected.
  (
    cd "${diagnose_dir}/sandbox-runtime-status" || exit 1
    bash script_collect.sh >> "${diagnose_dir}/sandbox_runtime.status"
  )
}
|
||
|
|
||
|
#######################################
# Upload the diagnostics archive to the OSS location given by UPLOAD_OSS
# ("<bucket>[/<path>]"). Does nothing when UPLOAD_OSS is empty.
# Credentials: ACCESS_KEY_ID / ACCESS_KEY_SECRET when ACCESS_KEY_ID is set,
# otherwise the role name returned by the instance metadata service.
# When OSS_PUBLIC_LINK is non-empty a signed URL valid for 7200s is printed.
# Globals: UPLOAD_OSS, OSS_PUBLIC_LINK, ACCESS_KEY_ID, ACCESS_KEY_SECRET,
#          current_dir, timestamp (all read)
# Returns: 0 when skipped; 1 when ossutil cannot be downloaded or the
#          temporary config file cannot be created.
#######################################
upload_oss() {
  if [[ "$UPLOAD_OSS" == "" ]]; then
    return 0
  fi

  local bucket_path=${UPLOAD_OSS}
  local diagnose_file=diagnose_${timestamp}.tar.gz
  local bucket_name=${bucket_path%%/*}
  local region endpoint role_name oss_endpoint config

  if ! command_exists ossutil; then
    # NOTE(review): binary is fetched over plain HTTP -- consider HTTPS.
    if ! curl -o /usr/local/bin/ossutil http://gosspublic.alicdn.com/ossutil/1.6.10/ossutil64; then
      echo "failed to download ossutil, skip upload"
      return 1
    fi
    chmod u+x /usr/local/bin/ossutil
  fi

  # The config may hold the access key secret: keep it in a private temp
  # file (removed below) instead of ./config in the working directory.
  config=$(mktemp) || return 1

  region=$(curl http://100.100.100.200/latest/meta-data/region-id)
  endpoint="oss-$region.aliyuncs.com"
  if [[ "$ACCESS_KEY_ID" == "" ]]; then
    role_name=$(curl 100.100.100.200/latest/meta-data/ram/security-credentials/)
    cat > "$config" << EOF

[Credentials]
language = CH
endpoint = $endpoint
[AkService]
ecsAk=http://100.100.100.200/latest/meta-data/Ram/security-credentials/$role_name
EOF
  else
    cat > "$config" << EOF

[Credentials]
language = CH
endpoint = $endpoint
accessKeyID = $ACCESS_KEY_ID
accessKeySecret = $ACCESS_KEY_SECRET

EOF
  fi

  # Prefer the endpoint the bucket itself reports, when it can be queried.
  oss_endpoint=$(ossutil stat "oss://$bucket_name" --config-file "$config" \
    | grep ExtranetEndpoint | awk '{print $3}')
  if [[ "$oss_endpoint" != "" ]]; then
    endpoint=$oss_endpoint
  fi

  # The archive is written to ${current_dir}; use that path explicitly
  # rather than depending on the current working directory.
  ossutil cp "${current_dir}/${diagnose_file}" "oss://$bucket_path/$diagnose_file" \
    --config-file "$config" --endpoint "$endpoint"

  if [[ "$OSS_PUBLIC_LINK" != "" ]]; then
    ossutil sign --timeout 7200 "oss://$bucket_path/$diagnose_file" \
      --config-file "$config" --endpoint "$endpoint"
  fi

  rm -f -- "$config"
}
|
||
|
|
||
|
#######################################
# Parse command-line options and export the matching settings.
#   --oss <bucket[/path]>        -> UPLOAD_OSS
#   --oss-public-link            -> OSS_PUBLIC_LINK="true"
#   --access-key-id <id>         -> ACCESS_KEY_ID
#   --access-key-secret <secret> -> ACCESS_KEY_SECRET
# Unknown options are reported and skipped.
# Arguments: the script's arguments
# Returns:   1 when an option that needs a value is the last argument,
#            0 otherwise.
#######################################
parse_args() {
  local key

  while [[ $# -gt 0 ]]; do
    key="$1"

    case "$key" in
      --oss-public-link)
        export OSS_PUBLIC_LINK="true"
        ;;
      --oss | --access-key-id | --access-key-secret)
        # Report a missing value instead of silently exporting an empty one.
        if [[ $# -lt 2 ]]; then
          echo "option [$key] requires a value"
          return 1
        fi
        case "$key" in
          --oss) export UPLOAD_OSS=$2 ;;
          --access-key-id) export ACCESS_KEY_ID=$2 ;;
          --access-key-secret) export ACCESS_KEY_SECRET=$2 ;;
        esac
        shift # consume the value; the option itself is shifted below
        ;;
      *)
        echo "unknown option [$key]"
        ;;
    esac
    shift
  done
}
|
||
|
|
||
|
#######################################
# Run every collector in order and finish by archiving the results.
# check_ps_hang runs before system_status and docker_status, which both read
# the is_ps_hang flag it sets.
#######################################
pd_collect() {
  local component

  # Host-level information.
  os_env
  system_info
  service_status
  network_info
  check_ps_hang
  system_status
  docker_status
  sandbox_runtime_status
  common_logs

  varlogmessage

  # Control-plane pod logs.
  core_component "cloud-controller-manager" "app"
  for component in "kube-apiserver" "kube-controller-manager" "kube-scheduler"; do
    core_component "$component" "component"
  done

  events
  storageplugins
  etcd
  cluster_dump
  archive
}
|
||
|
|
||
|
# Entry point: parse options, collect diagnostics, optionally upload the
# archive, then tell the user which file to hand over.
main() {
  parse_args "$@"

  pd_collect

  upload_oss

  echo "请上传 diagnose_${timestamp}.tar.gz"
}

main "$@"
|