OpenShift + vSphere + ACM Troubleshooting Runbook
OpenShift + vSphere + ACM Troubleshooting Runbook
1. SSH nodes
ssh core@<bootstrap-ip>
sudo -i
ssh core@<master-ip>
sudo -i
2. Bootstrap kubeconfig
export KUBECONFIG=/etc/kubernetes/kubeconfig
oc get nodes -o wide
3. Bootstrap checks
sudo crictl ps | grep etcd
systemctl status kubelet
journalctl -u kubelet -f
4. API connectivity
curl -k https://api-int.<cluster>.<domain>:6443/healthz
nc -vz api-int.<cluster>.<domain> 6443
5. DNS
cat /etc/resolv.conf
getent hosts api.<cluster>.<domain>
dig +short api-int.<cluster>.<domain>
6. Master kubeconfig
cd /etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs
export KUBECONFIG=$(pwd)/lb-int.kubeconfig
oc whoami
7. Cluster status
oc get nodes
oc get co
8. CSI troubleshooting
oc get pods -n openshift-cluster-csi-drivers
oc logs -n openshift-cluster-csi-drivers <pod> -c csi-driver
oc logs -n openshift-cluster-csi-drivers <pod> -c vsphere-syncer
9. Identify failing containers
oc get pod -n openshift-cluster-csi-drivers <pod> -o jsonpath='{range .status.containerStatuses[*]}{.name}{" ready="}{.ready}{" waiting="}{.state.waiting.reason}{" terminated="}{.lastState.terminated.reason}{"\n"}{end}'
10. Inspect CSI config
oc get secret -n openshift-cluster-csi-drivers vsphere-csi-config-secret -o jsonpath='{.data.cloud\.conf}' | base64 -d
11. Inspect credentials
oc get secret -n openshift-cluster-csi-drivers vmware-vsphere-cloud-credentials -o yaml
oc get secret -n openshift-cluster-csi-drivers vmware-vsphere-cloud-credentials -o jsonpath='{.data.agsvcs001\.agositafinco\.it\.username}' | base64 -d
12. Patch username
NEW=$(printf '%s' 'SRV-OCP-PROD@agositafinco.it' | base64 -w0)
oc patch secret -n openshift-cluster-csi-drivers vmware-vsphere-cloud-credentials --type=merge -p "{\"data\":{\"agsvcs001.agositafinco.it.username\":\"$NEW\"}}"
13. Restart CSI
oc delete pod -n openshift-cluster-csi-drivers --all
watch -n2 oc get pods -n openshift-cluster-csi-drivers
14. Verify storage operator
oc get co storage
15. ACM Import
kubectl apply -f import.yaml
16. ACM monitoring
oc get managedcluster
watch -n2 oc get managedcluster ocp01-prod
17. Agents
oc get pods -n open-cluster-management-agent
oc get pods -n open-cluster-management-agent-addon
18. Cleanup failed import
oc delete klusterlet --all --ignore-not-found
oc delete ns open-cluster-management-agent --ignore-not-found
oc delete ns open-cluster-management-agent-addon --ignore-not-found
oc delete crd klusterlets.operator.open-cluster-management.io --ignore-not-found
oc delete crd klusterletaddonconfigs.agent.open-cluster-management.io --ignore-not-found
Hub:
oc delete managedcluster ocp01-prod --ignore-not-found
oc delete ns ocp01-prod --ignore-not-found
oc delete klusterletaddonconfig ocp01-prod -n ocp01-prod --ignore-not-found
19. Final checks
oc get nodes
oc get co