diff --git a/backport-db2-add-monitor_retries-monitor_sleep-and-monitor_re.patch b/backport-db2-add-monitor_retries-monitor_sleep-and-monitor_re.patch new file mode 100644 index 0000000000000000000000000000000000000000..3164261bea6d760afefb551e1777b3c8d5c8645e --- /dev/null +++ b/backport-db2-add-monitor_retries-monitor_sleep-and-monitor_re.patch @@ -0,0 +1,158 @@ +From ded016f84d3fb77dc0542e3f4226774526910d97 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 7 Aug 2025 13:55:11 +0200 +Subject: [PATCH] db2: add "monitor_retries", "monitor_sleep", and + "monitor_retry_all_errors" parameters to be able to avoid failing on first + try + +--- + heartbeat/db2 | 80 +++++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 72 insertions(+), 8 deletions(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index da6c9d5f..fe1d9b89 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -41,11 +41,17 @@ + + OCF_RESKEY_instance_default="" + OCF_RESKEY_skip_basic_sql_health_check_default="false" ++OCF_RESKEY_monitor_retries_default="1" ++OCF_RESKEY_monitor_sleep_default="1" ++OCF_RESKEY_monitor_retry_all_errors_default="false" + OCF_RESKEY_admin_default="" + OCF_RESKEY_dbpartitionnum_default="0" + + : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}} + : ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}} ++: ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}} ++: ${OCF_RESKEY_monitor_sleep=${OCF_RESKEY_monitor_sleep_default}} ++: ${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}} + : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} + : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} + +@@ -108,11 +114,33 @@ Defaults to all databases in the instance. Specify one db for HADR mode. + + Skip basic health check SQL query. + +-Only set to "true" to avoid issues during high load. 
++Only set to "true" when the "monitor_retries" and "monitor_retry_all_errors" parameters arent ++enough to avoid issues under high load. + + Skip basic health check SQL query + + ++ ++ ++Monitor retries before failing. ++ ++Monitor retries ++ ++ ++ ++ ++Monitor sleep between tries. ++ ++Monitor sleep ++ ++ ++ ++ ++Set to true to retry monitor-action for all errors instead of the default "db2pd" race conditions. ++ ++Retry monitor for all errors ++ ++ + + + DEPRECATED: The admin user of the instance. +@@ -666,6 +694,7 @@ db2_hadr_status() { + local output + + output=$(runasdb2 db2pd -hadr -db $db) ++ ocf_log debug "db2_hadr_status: $output" + if [ $? != 0 ] + then + echo "Down/Off" +@@ -676,7 +705,34 @@ db2_hadr_status() { + awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"} + /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; } + /^HADR is not active/ {print "Standard/Standalone"; exit; } +- /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }' ++ /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; } ++ /^Option -hadr requires -db or -alldbs option and active database./ { exit 255 } ++ /^Another possibility of this failure is the Virtual Address Space Randomization is currently enabled on this system./ { exit 255 } ++ /^Changing data structure forced command termination./ { exit 255 }' ++} ++ ++db2_monitor_retry() { ++ local tries=$(($OCF_RESKEY_monitor_retries + 1)) ++ ++ for try in $(seq $tries); do ++ ocf_log debug "monitor try $try of $tries" ++ db2_monitor ++ rc=$? ++ [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ] && [ $rc -ne $OCF_NOT_RUNNING ] && ocf_log warn "Monitor failed with rc $rc." ++ if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ] || [ $rc -eq $OCF_NOT_RUNNING ] || { [ $rc -ne 255 ] && ! 
ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors" ;} ;then ++ break ++ fi ++ [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_sleep ++ done ++ ++ [ $rc -eq 255 ] && rc=$OCF_ERR_GENERIC ++ ++ if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ]; then ++ # instance is dead remove master score ++ master_score -D -l reboot ++ fi ++ ++ return $rc + } + + # +@@ -690,9 +746,7 @@ db2_monitor() { + db2_instance_status + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then +- # instance is dead remove master score +- master_score -D -l reboot +- exit $rc ++ return $rc + fi + + [ $db2node = 0 ] || return 0 +@@ -700,8 +754,18 @@ db2_monitor() { + + for db in $dblist + do +- hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC ++ hadr=$(db2_hadr_status $db) ++ rc=$? + ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr" ++ if [ "$rc" -eq 255 ]; then ++ if [ "$__OCF_ACTION" = "monitor" ]; then ++ return $rc ++ else ++ return $OCF_ERR_GENERIC ++ fi ++ elif [ "$rc" -ne 0 ]; then ++ return $OCF_ERR_GENERIC ++ fi + + # set master preference accordingly + case "$hadr" in +@@ -915,9 +979,9 @@ case "$__OCF_ACTION" in + exit $? + ;; + +- monitor) ++ monitor) + db2_validate +- db2_monitor ++ db2_monitor_retry + exit $? 
+ ;; + +-- +2.25.1 + diff --git a/backport-db2-add-skip_basic_sql_health_check-parameter-to-avo.patch b/backport-db2-add-skip_basic_sql_health_check-parameter-to-avo.patch new file mode 100644 index 0000000000000000000000000000000000000000..8d8ad232f77dfd35ee760bbed141295e32b49c6f --- /dev/null +++ b/backport-db2-add-skip_basic_sql_health_check-parameter-to-avo.patch @@ -0,0 +1,105 @@ +From fc240bdff60aae7133a532c7752c6253ce8f65ca Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Mon, 4 Aug 2025 16:53:09 +0200 +Subject: [PATCH] db2: add "skip_basic_sql_health_check" parameter to avoid + failing on systems with high load + +--- + heartbeat/db2 | 63 +++++++++++++++++++++++++++++++-------------------- + 1 file changed, 38 insertions(+), 25 deletions(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index 1cd66f15..da6c9d5f 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -40,10 +40,12 @@ + # Parameter defaults + + OCF_RESKEY_instance_default="" ++OCF_RESKEY_skip_basic_sql_health_check_default="false" + OCF_RESKEY_admin_default="" + OCF_RESKEY_dbpartitionnum_default="0" + + : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}} ++: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}} + : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} + : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} + +@@ -102,6 +104,15 @@ Defaults to all databases in the instance. Specify one db for HADR mode. + List of databases to be managed + + ++ ++ ++Skip basic health check SQL query. ++ ++Only set to "true" to avoid issues during high load. ++ ++Skip basic health check SQL query ++ ++ + + + DEPRECATED: The admin user of the instance. 
+@@ -695,31 +706,33 @@ db2_monitor() { + # set master preference accordingly + case "$hadr" in + PRIMARY/*|Primary/*|Standard/*) +- # perform a basic health check +- CMD="if db2 connect to $db; +- then +- db2 select \* from sysibm.sysversions ; rc=\$?; +- db2 terminate; +- else +- rc=\$?; +- fi; +- exit \$rc" +- +- if ! output=$(runasdb2 $CMD) +- then +- case "$output" in +- SQL1776N*) +- # can't connect/select on standby, may be spurious turing takeover +- ;; +- +- *) +- ocf_log err "DB2 database $instance($db2node)/$db is not working" +- ocf_log err "DB2 message: $output" +- +- # dead primary, remove master score +- master_score -D -l reboot +- return $OCF_ERR_GENERIC +- esac ++ if ! ocf_is_true "$OCF_RESKEY_skip_basic_sql_health_check"; then ++ # perform a basic health check ++ CMD="if db2 connect to $db; ++ then ++ db2 select \* from sysibm.sysversions ; rc=\$?; ++ db2 terminate; ++ else ++ rc=\$?; ++ fi; ++ exit \$rc" ++ ++ if ! output=$(runasdb2 $CMD) ++ then ++ case "$output" in ++ SQL1776N*) ++ # can't connect/select on standby, may be spurious turing takeover ++ ;; ++ ++ *) ++ ocf_log err "DB2 database $instance($db2node)/$db is not working" ++ ocf_log err "DB2 message: $output" ++ ++ # dead primary, remove master score ++ master_score -D -l reboot ++ return $OCF_ERR_GENERIC ++ esac ++ fi + fi + + ocf_log debug "DB2 database $instance($db2node)/$db appears to be working" +-- +2.25.1 + diff --git a/resource-agents.spec b/resource-agents.spec index e75a3527efe0c73d691ff6c5e8fee638975e55e3..54e8284aec762af932fe2342ca5b5ca1d1c9dec9 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -1,7 +1,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.16.0 -Release: 9 +Release: 10 License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: https://github.com/ClusterLabs/resource-agents/releases/tag/v%{version}.tar.gz @@ -21,6 +21,8 @@ Patch0012: 
backport-findif.sh-fix-to-avoid-duplicate-route-issues.patc Patch0013: backport-mariadb-add-SSL-TLS-Support-2045.patch Patch0014: backport-ocf-shellfuncs-set-SHELL-to-default-shell-if-it-s-se.patch Patch0015: backport-ocf-shellfuncs-remove-extra-sleep-from-curl_retry-20.patch +Patch0016: backport-db2-add-skip_basic_sql_health_check-parameter-to-avo.patch +Patch0017: backport-db2-add-monitor_retries-monitor_sleep-and-monitor_re.patch Obsoletes: heartbeat-resources <= %{version} Provides: heartbeat-resources = %{version} @@ -119,6 +121,10 @@ export CFLAGS="$(echo '%{optflags}')" %{_mandir}/man8/{ocf-tester.8*,ldirectord.8*} %changelog +* Wed Oct 29 2025 bizhiyuan - 4.16.0-10 +- db2: add "monitor_retries", "monitor_sleep", and "monitor_retry_all_errors" parameters to be able to avoid failing on first try +- db2: add "skip_basic_sql_health_check" parameter to avoid failing on systems with high load + * Fri Oct 24 2025 bizhiyuan - 4.16.0-9 - ocf-shellfuncs: set SHELL to default shell if it's set tonologin - ocf-shellfuncs: remove extra sleep from curl_retry (#2058)