diff --git a/agent/conf/agent.properties b/agent/conf/agent.properties
index 0dc5b8211e0d..964ed1e7c2b0 100644
--- a/agent/conf/agent.properties
+++ b/agent/conf/agent.properties
@@ -310,6 +310,22 @@ iscsi.session.cleanup.enabled=false
 # This parameter specifies if the host must be rebooted when something goes wrong with the heartbeat.
 #reboot.host.and.alert.management.on.heartbeat.timeout=true
 
+# Action taken by kvmheartbeat.sh / kvmspheartbeat.sh when a storage heartbeat
+# write fails persistently. Supersedes the legacy boolean property
+# 'reboot.host.and.alert.management.on.heartbeat.timeout' when set to a non-default value.
+#
+# Allowed values:
+#   reboot          - immediate sysrq-trigger reboot (default; original behavior; also the fallback for any unrecognized value)
+#   graceful-reboot - 'systemctl reboot' instead of sysrq; allows VMs to stop cleanly
+#   restart-agent   - restart cloudstack-agent only; running VMs are preserved
+#   log-only        - log + alert; take no automatic action (admin must investigate)
+#
+# The 'graceful-reboot', 'restart-agent', and 'log-only' actions are recommended
+# for setups using LINSTOR/DRBD or any local storage with replication, where
+# transient I/O contention can cause a heartbeat write to time out without the
+# host actually being unhealthy.
+#kvm.heartbeat.fence.action=reboot
+
 # Enables manually setting CPU's topology on KVM's VM.
 #enable.manually.setting.cpu.topology.on.kvm.vm=true
 
diff --git a/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java b/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java
index 3364f9708cf5..169f3d15834e 100644
--- a/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java
+++ b/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java
@@ -598,6 +598,25 @@ public class AgentProperties{
     public static final Property<Boolean> REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT
         = new Property<>("reboot.host.and.alert.management.on.heartbeat.timeout", true);
 
+    /**
+     * Action taken by the KVM agent's storage heartbeat scripts (kvmheartbeat.sh / kvmspheartbeat.sh)
+     * when a heartbeat write fails persistently. Allowed values:
+     * <ul>
+     *   <li>{@code reboot}: immediate reboot through sysrq-trigger (default; also used for unrecognized values);</li>
+     *   <li>{@code graceful-reboot}: reboot through {@code systemctl reboot} instead of sysrq;</li>
+     *   <li>{@code restart-agent}: restart only the cloudstack-agent service;</li>
+     *   <li>{@code log-only}: log the failure and take no automatic action.</li>
+     * </ul>
+     * The non-default values are recommended for setups using LINSTOR/DRBD or other replicated
+     * local storage, where transient I/O contention can cause a heartbeat write to time out
+     * without the host actually being unhealthy.
+     * Read by the heartbeat shell scripts directly from agent.properties.
+     * Data type: String.
+     * Default value: {@code reboot}
+     */
+    public static final Property<String> KVM_HEARTBEAT_FENCE_ACTION
+        = new Property<>("kvm.heartbeat.fence.action", "reboot");
+
     /**
      * Enables manually setting CPU's topology on KVM's VM.
      * Data type: Boolean.
diff --git a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
index 9b7eadada69f..4f8056922656 100755
--- a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
+++ b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
@@ -156,11 +156,49 @@ then
   exit 0
 elif [ "$cflag" == "1" ]
 then
-  /usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
-  sync &
-  sleep 5
-  echo b > /proc/sysrq-trigger
-  exit $?
+  # Read fence action from agent.properties (default: reboot for backward compatibility).
+  # Allowed values: reboot | graceful-reboot | restart-agent | log-only
+  AGENT_PROPS="/etc/cloudstack/agent/agent.properties"
+  FENCE_ACTION="reboot"
+  if [ -r "$AGENT_PROPS" ]; then
+    # Use the last uncommented assignment and strip all whitespace from its value.
+    val=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | tr -d '[:space:]')
+    [ -n "$val" ] && FENCE_ACTION="$val"
+  fi
+
+  case "$FENCE_ACTION" in
+    log-only)
+      /usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
+      exit 0
+      ;;
+    restart-agent)
+      /usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
+      sync &
+      sleep 2
+      systemctl restart cloudstack-agent
+      exit $?
+      ;;
+    graceful-reboot)
+      /usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
+      sync &
+      sleep 5
+      systemctl reboot
+      exit $?
+      ;;
+    reboot|*)
+      # Original behavior: immediate kernel-level reboot via sysrq-trigger.
+      # Unrecognized values also end up here, so a typo never disables fencing;
+      # log the offending value so the misconfiguration is visible to the operator.
+      if [ "$FENCE_ACTION" != "reboot" ]; then
+        /usr/bin/logger -t heartbeat "kvmheartbeat.sh: unrecognized kvm.heartbeat.fence.action '$FENCE_ACTION' in $AGENT_PROPS; falling back to 'reboot'."
+      fi
+      /usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
+      sync &
+      sleep 5
+      echo b > /proc/sysrq-trigger
+      exit $?
+      ;;
+  esac
 else
   write_hbLog
   exit $?
diff --git a/scripts/vm/hypervisor/kvm/kvmspheartbeat.sh b/scripts/vm/hypervisor/kvm/kvmspheartbeat.sh
index 3cb459e3e854..bc0294cfdfb7 100755
--- a/scripts/vm/hypervisor/kvm/kvmspheartbeat.sh
+++ b/scripts/vm/hypervisor/kvm/kvmspheartbeat.sh
@@ -58,9 +58,47 @@ deleteVMs() {
 
 if [ "$cflag" == "1" ]
 then
-  /usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
-  sync &
-  sleep 5
-  echo b > /proc/sysrq-trigger
-  exit $?
+  # Read fence action from agent.properties (default: reboot for backward compatibility).
+  # Allowed values: reboot | graceful-reboot | restart-agent | log-only
+  AGENT_PROPS="/etc/cloudstack/agent/agent.properties"
+  FENCE_ACTION="reboot"
+  if [ -r "$AGENT_PROPS" ]; then
+    # Use the last uncommented assignment and strip all whitespace from its value.
+    val=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | tr -d '[:space:]')
+    [ -n "$val" ] && FENCE_ACTION="$val"
+  fi
+
+  case "$FENCE_ACTION" in
+    log-only)
+      /usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
+      exit 0
+      ;;
+    restart-agent)
+      /usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
+      sync &
+      sleep 2
+      systemctl restart cloudstack-agent
+      exit $?
+      ;;
+    graceful-reboot)
+      /usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
+      sync &
+      sleep 5
+      systemctl reboot
+      exit $?
+      ;;
+    reboot|*)
+      # Original behavior: immediate kernel-level reboot via sysrq-trigger.
+      # Unrecognized values also end up here, so a typo never disables fencing;
+      # log the offending value so the misconfiguration is visible to the operator.
+      if [ "$FENCE_ACTION" != "reboot" ]; then
+        /usr/bin/logger -t heartbeat "kvmspheartbeat.sh: unrecognized kvm.heartbeat.fence.action '$FENCE_ACTION' in $AGENT_PROPS; falling back to 'reboot'."
+      fi
+      /usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
+      sync &
+      sleep 5
+      echo b > /proc/sysrq-trigger
+      exit $?
+      ;;
+  esac
 fi