summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRainer Gerhards <rgerhards@adiscon.com>2013-04-07 18:48:48 +0200
committerRainer Gerhards <rgerhards@adiscon.com>2013-04-07 18:48:48 +0200
commit2de4a04b7e728f533382c9839345a89cbf73db89 (patch)
tree4664cba19491ea1d8c4a6283079c1694e1adeeb0
parent9b85b24d1323c91a06aeef08bbbde7a96afd46d6 (diff)
downloadrsyslog-2de4a04b7e728f533382c9839345a89cbf73db89.tar.gz
rsyslog-2de4a04b7e728f533382c9839345a89cbf73db89.tar.bz2
rsyslog-2de4a04b7e728f533382c9839345a89cbf73db89.zip
bugfix: failover/action suspend did not work correctly
This was experienced if the retry action took more than one second to complete. For suspending, a cached timestamp was used, and if the retry took longer, that timestamp was already in the past. As a result, the action never was kept in suspended state, and as such no failover happened. The suspend functionalit now does no longer use the cached timestamp (should not have any performance implication, as action suspend occurs very infrequently). Also added some better debug logging for the action engine.
-rw-r--r--ChangeLog8
-rw-r--r--action.c30
2 files changed, 28 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index 854a14d3..c4546db6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -11,6 +11,14 @@ Version 7.2.7 [v7-stable] 2013-03-??
The rsyslog debug log file is now continued to be written across the
fork.
- updated systemd files to match current systemd source
+- bugfix: failover/action suspend did not work correctly
+ This was experienced if the retry action took more than one second
+ to complete. For suspending, a cached timestamp was used, and if the
+ retry took longer, that timestamp was already in the past. As a
+ result, the action never was kept in suspended state, and as such
+ no failover happened. The suspend functionalit now does no longer use
+ the cached timestamp (should not have any performance implication, as
+ action suspend occurs very infrequently).
- bugfix: nested if/prifilt conditions did not work properly
closes: http://bugzilla.adiscon.com/show_bug.cgi?id=415
- bugfix: script == comparison did not work properly on JSON objects
diff --git a/action.c b/action.c
index fd8727ee..8fc92e56 100644
--- a/action.c
+++ b/action.c
@@ -613,13 +613,17 @@ static void actionDisable(action_t *pThis)
* CPU time. TODO: maybe a config option for that?
* rgerhards, 2007-08-02
*/
-static inline void actionSuspend(action_t *pThis, time_t ttNow)
+static inline void actionSuspend(action_t *pThis)
{
- if(ttNow == NO_TIME_PROVIDED)
- datetime.GetTime(&ttNow);
+ time_t ttNow;
+
+ /* note: we can NOT use a cached timestamp, as time may have evolved
+ * since caching, and this would break logic (and it actually did so!)
+ */
+ datetime.GetTime(&ttNow);
pThis->ttResumeRtry = ttNow + pThis->iResumeInterval * (pThis->iNbrResRtry / 10 + 1);
actionSetState(pThis, ACT_STATE_SUSP);
- DBGPRINTF("earliest retry=%d\n", (int) pThis->ttResumeRtry);
+ DBGPRINTF("action suspended, earliest retry=%d\n", (int) pThis->ttResumeRtry);
}
@@ -639,7 +643,7 @@ static inline void actionSuspend(action_t *pThis, time_t ttNow)
* of its inability to recover. -- rgerhards, 2010-04-26.
*/
static inline rsRetVal
-actionDoRetry(action_t *pThis, time_t ttNow, int *pbShutdownImmediate)
+actionDoRetry(action_t *pThis, int *pbShutdownImmediate)
{
int iRetries;
int iSleepPeriod;
@@ -650,7 +654,9 @@ actionDoRetry(action_t *pThis, time_t ttNow, int *pbShutdownImmediate)
iRetries = 0;
while((*pbShutdownImmediate == 0) && pThis->eState == ACT_STATE_RTRY) {
+ DBGPRINTF("actionDoRetry: enter loop, iRetries=%d\n", iRetries);
iRet = pThis->pMod->tryResume(pThis->pModData);
+ DBGPRINTF("actionDoRetry: action->tryResume returned %d\n", iRet);
if((pThis->iResumeOKinRow > 9) && (pThis->iResumeOKinRow % 10 == 0)) {
bTreatOKasSusp = 1;
pThis->iResumeOKinRow = 0;
@@ -658,16 +664,18 @@ actionDoRetry(action_t *pThis, time_t ttNow, int *pbShutdownImmediate)
bTreatOKasSusp = 0;
}
if((iRet == RS_RET_OK) && (!bTreatOKasSusp)) {
+ DBGPRINTF("actionDoRetry: had success RDY again (iRet=%d)\n", iRet);
actionSetState(pThis, ACT_STATE_RDY);
} else if(iRet == RS_RET_SUSPENDED || bTreatOKasSusp) {
/* max retries reached? */
+ DBGPRINTF("actionDoRetry: check for max retries, iResumeRetryCount %d, iRetries %d\n",
+ pThis->iResumeRetryCount, iRetries);
if((pThis->iResumeRetryCount != -1 && iRetries >= pThis->iResumeRetryCount)) {
- actionSuspend(pThis, ttNow);
+ actionSuspend(pThis);
} else {
++pThis->iNbrResRtry;
++iRetries;
iSleepPeriod = pThis->iResumeInterval;
- ttNow += iSleepPeriod; /* not truly exact, but sufficiently... */
srSleep(iSleepPeriod, 0);
if(*pbShutdownImmediate) {
ABORT_FINALIZE(RS_RET_FORCE_TERM);
@@ -714,7 +722,7 @@ static rsRetVal actionTryResume(action_t *pThis, int *pbShutdownImmediate)
if(pThis->eState == ACT_STATE_RTRY) {
if(ttNow == NO_TIME_PROVIDED) /* use cached result if we have it */
datetime.GetTime(&ttNow);
- CHKiRet(actionDoRetry(pThis, ttNow, pbShutdownImmediate));
+ CHKiRet(actionDoRetry(pThis, pbShutdownImmediate));
}
if(Debug && (pThis->eState == ACT_STATE_RTRY ||pThis->eState == ACT_STATE_SUSP)) {
@@ -1125,6 +1133,7 @@ submitBatch(action_t *pAction, batch_t *pBatch, int nElem)
assert(pBatch != NULL);
+ DBGPRINTF("submitBatch: enter, nElem %d\n", nElem);
wasDoneTo = pBatch->iDoneUpTo;
bDone = 0;
do {
@@ -1146,7 +1155,8 @@ submitBatch(action_t *pAction, batch_t *pBatch, int nElem)
|| localRet == RS_RET_DEFER_COMMIT) {
bDone = 1;
} else if(localRet == RS_RET_SUSPENDED) {
- ; /* do nothing, this will retry the full batch */
+ DBGPRINTF("action ret RS_RET_SUSPENDED - retry full batch\n");
+ /* do nothing, this will retry the full batch */
} else if(localRet == RS_RET_ACTION_FAILED) {
/* in this case, everything not yet committed is BAD */
for(i = pBatch->iDoneUpTo ; i < wasDoneTo + nElem ; ++i) {
@@ -1974,7 +1984,7 @@ addAction(action_t **ppAction, modInfo_t *pMod, void *pModData,
pAction->eState = ACT_STATE_RDY; /* action is enabled */
if(bSuspended)
- actionSuspend(pAction, datetime.GetTime(NULL)); /* "good" time call, only during init and unavoidable */
+ actionSuspend(pAction);
CHKiRet(actionConstructFinalize(pAction, queueParams));