mirror of https://github.com/lework/script
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
338 lines
10 KiB
338 lines
10 KiB
#!/usr/bin/env bash
#
# Node diagnostics collector: gathers system, network, docker and
# kubernetes information into /tmp/diagnose_<epoch>, packs it into a
# tarball and optionally uploads it to OSS.

# Trace every command so the console shows what was executed.
set -x

# Directory the script was started from; the archive is written here.
current_dir=$(pwd)

# Epoch seconds, used to name this run's work directory and archive.
timestamp=$(date +%s)

# Work directory that every collector writes into.
diagnose_dir="/tmp/diagnose_${timestamp}"
if ! mkdir -p -- "${diagnose_dir}"; then
  # Nothing can be collected without the work directory.
  echo "cannot create ${diagnose_dir}" >&2
  exit 1
fi

# Set to true by check_ps_hang when `ps -ef` does not finish in time.
is_ps_hang=false
|
|
|
# Run a command between banner lines and report when it fails.
# Arguments: $@ - command and its arguments
# Outputs:   banner, the command's output and a failure note on stdout
run() {
  local status=0

  echo
  echo "-----------------run $*------------------"

  # timeout(1) can only execute external programs. Shell functions and
  # builtins (e.g. `dist`, `ulimit`) are invoked directly, without a limit.
  if [[ "$(type -t "$1")" == "file" ]]; then
    timeout 10s "$@" || status=$?
  else
    "$@" || status=$?
  fi

  if [[ "${status}" != "0" ]]; then
    echo "failed to collect info: $*"
  fi

  echo "------------End of ${1}----------------"
}
|
|
|
# Detect the distribution and export it as OS.
# Arguments: $1 - optional os-release file to inspect
#                 (default: /etc/os-release)
# Globals:   OS (exported)
# Exits the whole script with status 1 when no known name is found.
os_env() {
  local release_file="${1:-/etc/os-release}"
  local entry

  # "grep pattern=OS value" pairs, tried in order; the first match wins.
  # Note that Kylin Linux is reported as CentOS.
  local -a known=(
    "Ubuntu=Ubuntu"
    "SUSE=SUSE"
    "Red Hat=RedHat"
    "CentOS Linux=CentOS"
    "Kylin Linux=CentOS"
    "Aliyun Linux=AliyunOS"
    "Alibaba Group Enterprise Linux=AliOS"
  )

  for entry in "${known[@]}"; do
    if grep -q -- "${entry%%=*}" "${release_file}"; then
      export OS="${entry#*=}"
      return
    fi
  done

  echo "unknown os... exit."
  exit 1
}
|
|
|
# Print the contents of every /etc/issue* file.
dist() {
  cat /etc/issue*
}
|
|
|
# Succeed when the given command name resolves (binary, builtin or function).
# Arguments: $@ - command name(s) handed to `command -v`
# Outputs:   nothing; both stdout and stderr are discarded
command_exists() {
  command -v "$@" > /dev/null 2>&1
}
|
|
|
# Service status
# Record the status of firewalld, ntpd and chronyd in
# ${diagnose_dir}/service_status.
service_status() {
  local svc

  for svc in firewalld ntpd chronyd; do
    # Append with -a: a plain tee truncates the file on every call, which
    # would leave only the last service's status in it.
    run service "${svc}" status | tee -a "${diagnose_dir}/service_status"
  done
}
|
|
|
|
|
#system info
# Append kernel, distribution, ulimit and sysctl details to
# ${diagnose_dir}/system_info.
system_info() {
  local out="${diagnose_dir}/system_info"

  run uname -a | tee -a "${out}"
  run uname -r | tee -a "${out}"
  # NOTE(review): `dist` is a shell function and `ulimit` a builtin; they
  # only produce output if `run` can invoke non-external commands.
  run dist | tee -a "${out}"

  if command_exists lsb_release; then
    # -a prints distributor, description, release and codename; without an
    # option lsb_release only reports the LSB module version.
    run lsb_release -a | tee -a "${out}"
  fi

  run ulimit -a | tee -a "${out}"
  run sysctl -a | tee -a "${out}"
}
|
|
|
#network
# Append addresses, links, routes, iptables rules and socket tables to
# ${diagnose_dir}/network_info.
network_info() {
  local out="${diagnose_dir}/network_info"

  run ip --details ad show | tee -a "${out}"
  run ip --details link show | tee -a "${out}"
  run ip route show | tee -a "${out}"
  run iptables-save | tee -a "${out}"

  # netstat goes through `run` like every other collector (and like the
  # netstat calls in system_status) so it gets the same banner handling.
  run netstat -nt | tee -a "${out}"
  run netstat -nu | tee -a "${out}"
  run netstat -ln | tee -a "${out}"
}
|
|
|
|
|
# check ps -ef command is hung
# Runs `ps -ef` with a 2 second limit (SIGKILL on expiry). When it fails,
# sets the global is_ps_hang=true and scans every task's status file for
# state D; for each such task whose cmdline cannot be read within 2 seconds
# the task's kernel stack is appended to the report.
# Globals: diagnose_dir (read), is_ps_hang (written)
# Outputs: progress on stdout and in ${diagnose_dir}/ps_command_status
check_ps_hang() {
  local report="${diagnose_dir}/ps_command_status"
  local status_file task_dir pid tid

  echo "check if ps -ef command hang" | tee -a "${report}"

  if timeout -s 9 2 ps -ef > /dev/null; then
    echo "ps -ef command works fine" | tee -a "${report}"
    return
  fi

  echo "ps -ef command is hung" | tee -a "${report}"
  is_ps_hang=true
  echo "start to check which process lead to ps -ef command hang" | tee -a "${report}"

  for status_file in /proc/[0-9]*/task/[0-9]*/status; do
    # Tasks may exit between glob expansion and the read (and an unmatched
    # glob stays literal); skip anything that is not readable.
    [[ -r "${status_file}" ]] || continue
    grep -q '^State:[[:space:]]*D' "${status_file}" 2> /dev/null || continue

    # /proc/<pid>/task/<tid>/status -> directory, tid and pid.
    task_dir="${status_file%/status}"
    tid="${task_dir##*/}"
    pid="${task_dir#/proc/}"
    pid="${pid%%/*}"

    # A D-state task whose cmdline read does not complete is reported as
    # the one blocking ps.
    if ! timeout -s 9 2 cat "${task_dir}/cmdline"; then
      echo "process ${pid} (task ${tid}) is in State D and lead to ps -ef process hang,stack info:" | tee -a "${report}"
      tee -a "${report}" < "${task_dir}/stack"
    fi
  done

  echo "finish to check which process lead to ps -ef command hang" | tee -a "${report}"
}
|
|
|
|
|
#system status
# Append load, process, socket, mount and open-file information to
# ${diagnose_dir}/system_status. The ps/pstree steps are skipped when
# is_ps_hang is not "false".
system_status() {
  local out="${diagnose_dir}/system_status"

  run uptime | tee -a "${out}"
  run top -b -n 1 | tee -a "${out}"

  if [[ "${is_ps_hang}" == "false" ]]; then
    run ps -ef | tee -a "${out}"
  else
    echo "ps -ef command hang, skip [ps -ef] check" | tee -a "${out}"
  fi

  run netstat -nt | tee -a "${out}"
  run netstat -nu | tee -a "${out}"
  run netstat -ln | tee -a "${out}"

  run df -h | tee -a "${out}"

  run cat /proc/mounts | tee -a "${out}"

  if [[ "${is_ps_hang}" == "false" ]]; then
    run pstree -al | tee -a "${out}"
  else
    echo "ps -ef command hang, skip [pstree -al] check" | tee -a "${out}"
  fi

  run lsof | tee -a "${out}"

  # Ten processes with the most open file descriptors, one line each:
  # "<count> fds (PID = <pid>), command: <cmdline>".
  (
    cd /proc || exit
    # The directory is passed as $1 instead of being pasted into the
    # bash -c code, so the name is never parsed as shell syntax.
    find . -maxdepth 1 -type d -name '[0-9]*' \
      -exec bash -c 'ls "$1/fd/" | wc -l | tr "\n" " "' _ {} \; \
      -printf "fds (PID = %P), command: " \
      -exec bash -c 'tr "\0" " " < "$1/cmdline"' _ {} \; \
      -exec echo \; \
      | sort -rn \
      | head \
      | tee -a "${out}"
  )
}
|
|
|
|
|
# Append `systemctl status <unit> -l` for the container/kubernetes daemons
# to ${diagnose_dir}/<unit>_status.
daemon_status() {
  local unit

  for unit in docker containerd container-storaged kubelet; do
    run systemctl status "${unit}" -l | tee -a "${diagnose_dir}/${unit}_status"
  done
}
|
|
|
# Collect dockerd process info, `docker info`/`docker version` and the
# daemon's log files into ${diagnose_dir}.
# Globals: diagnose_dir (read), is_ps_hang (read)
docker_status() {
  local out="${diagnose_dir}/docker_status"
  local pid_file=/var/run/docker.pid
  local events_log=/var/run/docker/libcontainerd/containerd/events.log
  local docker_pid=""
  local log

  echo "check dockerd process"
  if [[ "${is_ps_hang}" == "false" ]]; then
    run ps -ef | grep -E 'dockerd|docker daemon' | grep -v grep | tee -a "${out}"
  else
    echo "ps -ef command hang, skip [ps -ef|grep -E 'dockerd|docker daemon'] check" | tee -a "${out}"
  fi

  #docker info
  run docker info | tee -a "${out}"
  run docker version | tee -a "${out}"

  # Only signal when a pid is actually known; `kill -SIGUSR1` with an empty
  # argument list is just an error.
  if [[ -r "${pid_file}" ]]; then
    docker_pid=$(< "${pid_file}")
  fi
  if [[ -n "${docker_pid}" ]]; then
    # NOTE(review): presumably SIGUSR1 makes dockerd write a dump into
    # /var/run/docker/*.log, collected after the wait below -- confirm.
    sudo kill -SIGUSR1 "${docker_pid}"
  fi

  if [[ -f "${events_log}" ]]; then
    cp "${events_log}" "${diagnose_dir}/containerd_events.log"
  fi

  # Waiting is only useful when the daemon was signalled.
  if [[ -n "${docker_pid}" ]]; then
    sleep 10
  fi

  for log in /var/run/docker/*.log; do
    # An unmatched glob stays literal; skip it.
    [[ -e "${log}" ]] || continue
    cp "${log}" "${diagnose_dir}"
  done
}
|
|
|
# Print the last 200 lines of a log file; prints nothing when the path is
# not a regular file.
# Arguments: $1 - path of the log file
showlog() {
  local file=$1

  if [[ -f "${file}" ]]; then
    # Quoted so paths containing spaces or glob characters work.
    tail -n 200 -- "${file}"
  fi
}
|
|
|
#collect log
# Save kernel, syslog and docker daemon logs under ${diagnose_dir}/logs.
common_logs() {
  local log_tail_lines=10000
  local log_dir="${diagnose_dir}/logs"

  mkdir -p "${log_dir}"

  run dmesg -T | tail -n "${log_tail_lines}" | tee "${log_dir}/dmesg.log"

  # Last 500M of the syslog; errors (e.g. missing file) land in the output.
  tail -c 500M /var/log/messages &> "${log_dir}/messages"

  # Explicit if/else: with `a && b || c` a failing journalctl would also
  # run the upstart branch and overwrite docker.log.
  if pidof systemd > /dev/null; then
    journalctl -n "${log_tail_lines}" -u docker.service &> "${log_dir}/docker.log"
  else
    tail -n "${log_tail_lines}" /var/log/upstart/docker.log &> "${log_dir}/docker.log"
  fi
}
|
|
|
# Pack ${diagnose_dir} into ${current_dir}/diagnose_<timestamp>.tar.gz.
# Globals: current_dir, diagnose_dir, timestamp (all read)
archive() {
  # Quoted: current_dir comes from `pwd` and may contain spaces.
  tar -zcvf "${current_dir}/diagnose_${timestamp}.tar.gz" "${diagnose_dir}"
  echo "please get diagnose_${timestamp}.tar.gz for diagnostics"
}
|
|
|
# Save the cloud-init lines of /var/log/messages to
# ${diagnose_dir}/varlogmessage.log.
varlogmessage() {
  grep cloud-init /var/log/messages > "${diagnose_dir}/varlogmessage.log"
}
|
|
|
# Save the output of `kubectl cluster-info dump` to
# ${diagnose_dir}/cluster_dump.log.
cluster_dump() {
  kubectl cluster-info dump > "${diagnose_dir}/cluster_dump.log"
}
|
|
|
# Save the output of `kubectl get events` to ${diagnose_dir}/events.log.
events() {
  kubectl get events > "${diagnose_dir}/events.log"
}
|
|
|
# Save the logs of every kube-system pod selected by <label>=<comp> to
# ${diagnose_dir}/cs/<comp>/<pod>.log.
# Arguments: $1 - component name (label value, also the directory name)
#            $2 - label key
core_component() {
  local comp="$1"
  local label="$2"
  local out_dir="${diagnose_dir}/cs/${comp}"
  local pods po

  mkdir -p "${out_dir}"

  # First column of the listing, without the NAME header row.
  pods=$(kubectl get -n kube-system po -l "${label}=${comp}" \
    | awk 'NR > 1 {print $1}')

  # Intentional word splitting: one pod name per word.
  for po in ${pods}; do
    kubectl logs -n kube-system "${po}" &> "${out_dir}/${po}.log"
  done
}
|
|
|
# Save the etcd unit's journal to ${diagnose_dir}/cs/etcd.log.
etcd() {
  # Make sure the target directory exists even when no core_component
  # call created it before.
  mkdir -p "${diagnose_dir}/cs"
  journalctl -u etcd -xe &> "${diagnose_dir}/cs/etcd.log"
}
|
|
|
# Copy the files in /var/log/alicloud to ${diagnose_dir}/storage/.
storageplugins() {
  local src=/var/log/alicloud
  local dest="${diagnose_dir}/storage"

  mkdir -p "${dest}"

  # Nothing to copy on nodes without the log directory.
  if [[ ! -d "${src}" ]]; then
    echo "no ${src} directory, skip storage plugin logs"
    return 0
  fi

  cp "${src}"/* "${dest}/"
}
|
|
|
# Download the sandbox-runtime-status collector, run it and append its
# output to ${diagnose_dir}/sandbox_runtime.status.
# Does nothing (status 0) when dockerd is running or containerd is not.
# Returns: non-zero when the download, extraction or directory change fails
sandbox_runtime_status() {
  local url=http://aliacs-k8s-cn-hangzhou.oss-cn-hangzhou.aliyuncs.com/public/diagnose/sandbox-runtime-status.tgz
  local tarball="${diagnose_dir}/sandbox-runtime-status.tgz"

  if [[ -n "$(pidof dockerd)" || -z "$(pidof containerd)" ]]; then
    return 0
  fi

  # NOTE(review): the archive is fetched over plain HTTP and a script from
  # it is executed afterwards; consider HTTPS plus a checksum.
  if ! wget "${url}" -q -O "${tarball}"; then
    echo "failed to download sandbox-runtime-status.tgz"
    return 1
  fi

  if ! tar -xzvf "${tarball}" -C "${diagnose_dir}"; then
    echo "failed to extract sandbox-runtime-status.tgz"
    return 1
  fi

  # Never run the collector from the wrong directory.
  pushd "${diagnose_dir}/sandbox-runtime-status" || return 1
  bash script_collect.sh >> "${diagnose_dir}/sandbox_runtime.status"
  popd
}
|
|
|
# Upload the diagnose archive to the OSS location given with --oss.
# Globals: UPLOAD_OSS        - "<bucket>[/<path>]"; empty means skip
#          ACCESS_KEY_ID     - optional; without it the instance RAM role
#          ACCESS_KEY_SECRET   from the metadata service is configured
#          OSS_PUBLIC_LINK   - non-empty: also print a signed URL (7200s)
#          current_dir, timestamp (read)
# Returns: 0 when skipped; otherwise the status of the upload
upload_oss() {
  if [[ -z "${UPLOAD_OSS:-}" ]]; then
    return 0
  fi

  local bucket_path="${UPLOAD_OSS}"
  local diagnose_file="diagnose_${timestamp}.tar.gz"
  local bucket_name="${bucket_path%%/*}"
  local region endpoint role_name oss_endpoint config_file
  local had_xtrace=0
  local rc=0

  if ! command_exists ossutil; then
    # NOTE(review): binary fetched over plain HTTP without verification.
    # -f keeps an HTTP error page from being installed as the binary.
    if ! curl -f -o /usr/local/bin/ossutil http://gosspublic.alicdn.com/ossutil/1.6.10/ossutil64; then
      echo "failed to download ossutil"
      return 1
    fi
    chmod u+x /usr/local/bin/ossutil
  fi

  region=$(curl -s --connect-timeout 5 http://100.100.100.200/latest/meta-data/region-id)
  endpoint="oss-${region}.aliyuncs.com"

  # Private temp file instead of ./config: it does not clobber a file in
  # the working directory and is removed before returning.
  config_file=$(mktemp) || return 1

  # Keep the credentials out of the `set -x` trace.
  if [[ $- == *x* ]]; then
    had_xtrace=1
    { set +x; } 2> /dev/null
  fi

  if [[ -z "${ACCESS_KEY_ID:-}" ]]; then
    role_name=$(curl -s --connect-timeout 5 100.100.100.200/latest/meta-data/ram/security-credentials/)
    printf '%s\n' \
      "[Credentials]" \
      "language = CH" \
      "endpoint = ${endpoint}" \
      "[AkService]" \
      "ecsAk=http://100.100.100.200/latest/meta-data/Ram/security-credentials/${role_name}" \
      > "${config_file}"
  else
    printf '%s\n' \
      "[Credentials]" \
      "language = CH" \
      "endpoint = ${endpoint}" \
      "accessKeyID = ${ACCESS_KEY_ID}" \
      "accessKeySecret = ${ACCESS_KEY_SECRET:-}" \
      > "${config_file}"
  fi

  if [[ "${had_xtrace}" == "1" ]]; then
    set -x
  fi

  # Prefer the endpoint the bucket itself reports, when it can be queried.
  oss_endpoint=$(ossutil stat "oss://${bucket_name}" --config-file "${config_file}" \
    | awk '/ExtranetEndpoint/ {print $3}')
  if [[ -n "${oss_endpoint}" ]]; then
    endpoint="${oss_endpoint}"
  fi

  # The archive lives in current_dir; fall back to "." if it is unset.
  ossutil cp "${current_dir:-.}/${diagnose_file}" "oss://${bucket_path}/${diagnose_file}" \
    --config-file "${config_file}" --endpoint "${endpoint}" || rc=$?

  if [[ -n "${OSS_PUBLIC_LINK:-}" ]]; then
    ossutil sign --timeout 7200 "oss://${bucket_path}/${diagnose_file}" \
      --config-file "${config_file}" --endpoint "${endpoint}"
  fi

  rm -f -- "${config_file}"
  return "${rc}"
}
|
|
|
# Parse the command line into exported settings.
#   --oss <bucket/path>         -> UPLOAD_OSS
#   --oss-public-link           -> OSS_PUBLIC_LINK="true"
#   --access-key-id <id>        -> ACCESS_KEY_ID
#   --access-key-secret <value> -> ACCESS_KEY_SECRET
# Unknown options are reported and skipped.
# Returns: 1 when an option that needs a value is the last argument
parse_args() {
  local key

  while [[ $# -gt 0 ]]; do
    key="$1"

    case "${key}" in
      --oss | --access-key-id | --access-key-secret)
        if [[ $# -lt 2 ]]; then
          echo "option [${key}] requires a value"
          return 1
        fi
        case "${key}" in
          --oss) export UPLOAD_OSS="$2" ;;
          --access-key-id) export ACCESS_KEY_ID="$2" ;;
          --access-key-secret) export ACCESS_KEY_SECRET="$2" ;;
        esac
        # Drop the value; the shift after the case drops the option.
        shift
        ;;
      --oss-public-link)
        export OSS_PUBLIC_LINK="true"
        ;;
      *)
        echo "unknown option [${key}]"
        ;;
    esac
    shift
  done
}
|
|
|
# Run every collector in order and pack the result with `archive`.
pd_collect() {
  # Host level information.
  os_env
  system_info
  service_status
  network_info
  # Must run before system_status/docker_status: they read is_ps_hang.
  check_ps_hang
  system_status

  # Container runtime. daemon_status was defined but never invoked, so the
  # systemctl status of docker/containerd/kubelet was missing from the dump.
  daemon_status
  docker_status
  sandbox_runtime_status
  common_logs

  # Cluster level information.
  varlogmessage
  core_component "cloud-controller-manager" "app"
  core_component "kube-apiserver" "component"
  core_component "kube-controller-manager" "component"
  core_component "kube-scheduler" "component"
  events
  storageplugins
  etcd
  cluster_dump

  archive
}
|
|
|
# Entry point: parse options, collect everything, optionally upload.

# Tracing is switched off while the options are parsed so that the value of
# --access-key-secret is not echoed by `set -x`; it is restored afterwards.
xtrace_was_on=false
case "$-" in
  *x*) xtrace_was_on=true ;;
esac
{ set +x; } 2> /dev/null
parse_args "$@"
if [[ "${xtrace_was_on}" == "true" ]]; then
  set -x
fi

pd_collect

upload_oss

# User-facing hint (Chinese): "please upload diagnose_<timestamp>.tar.gz".
echo "请上传 diagnose_${timestamp}.tar.gz"
|
|
|