[Date Prev][Date Next]
[Chronological]
[Thread]
[Top]
Re: No replication after power failure
On Wed, 2007-10-03 at 16:08 +0200, Pierangelo Masarati wrote:
> Stelios Grigoriadis wrote:
> > I am not sure this would be considered a bug, but it is a problem for
> > us. If the master goes down, the replicas have no way of detecting it.
> > When the master is going back up again, all replica servers have to be
> > restarted. Is there a way to avoid this?
> >
> > Using the KEEPALIVE option (socket or TCP) is not really an option since
> > the default timeout is 2 hours which is too long.
> >
> > Another would be to have some kind of timeout in the epoll and check if
> > the master is responding, but that timeout is used for the runqueue?
> >
> > Have you come across this? I was surprised to see that no one has had
> > any issues with it. Am I missing something?
>
> This was recently discussed (ITS#5133), and the only alternative to
> SO_KEEPALIVE would be to have some background thread poll the producer
> on the syncrepl descriptors on a regular basis performing some no-op
> (like searching the rootDSE requesting 1.1). Aaron Richton noted that
> support for SO_KEEPALIVE was added in OpenLDAP 2.3.28.
>
> p.
>
>
>
> Ing. Pierangelo Masarati
> OpenLDAP Core Team
>
> SysNet s.r.l.
> via Dossi, 8 - 27100 Pavia - ITALIA
> http://www.sys-net.it
> ---------------------------------------
> Office: +39 02 23998309
> Mobile: +39 333 4963172
> Email: pierangelo.masarati@sys-net.it
> ---------------------------------------
>
>
I have solved the problem by inserting a periodic check in the runqueue
(called do_mastercheck). The interval is determined by a slapd.conf
parameter (mastercheckint) in the syncrepl section. The parameter is
optional. If it's not specified, the check is not inserted in the
runqueue. I have tested the code and it seems to work.
The do_mastercheck function just does a dummy search against the master.
I'm supplying a patch (only syncrepl.c is affected) so you can hopefully
improve and incorporate the solution in the code.
/Stelios
--- servers/slapd/syncrepl.c 2007-10-05 15:17:32.000000000 +0200
+++ syncrepl.c 2007-10-05 15:17:38.000000000 +0200
@@ -78,6 +78,7 @@
int si_manageDSAit;
int si_slimit;
int si_tlimit;
+ int si_mastercheck_int;
int si_refreshDelete;
int si_refreshPresent;
int si_syncdata;
@@ -1017,6 +1018,35 @@
}
static void *
+do_mastercheck(
+ void *ctx,
+ void *arg )
+{
+ struct re_s* rtask = arg;
+ syncinfo_t *si = ( syncinfo_t * ) rtask->arg;
+ int rc;
+ char *search_attrs[] = { NULL };
+ int res;
+
+ if (si->si_ld) {
+ rc=ldap_search_ext_s(si->si_ld, "", LDAP_SCOPE_BASE, "(objectClass=*)", search_attrs, 0, NULL, NULL, NULL, 0, &res);
+ }
+
+ ldap_pvt_thread_mutex_lock( &slapd_rq.rq_mutex );
+
+ if ( ldap_pvt_runqueue_isrunning( &slapd_rq, rtask )) {
+ ldap_pvt_runqueue_stoptask( &slapd_rq, rtask );
+ }
+
+
+ rtask->interval.tv_sec = si->si_interval;
+ ldap_pvt_runqueue_resched( &slapd_rq, rtask, 0 );
+
+ ldap_pvt_thread_mutex_unlock( &slapd_rq.rq_mutex );
+
+}
+
+static void *
do_syncrepl(
void *ctx,
void *arg )
@@ -2772,6 +2802,7 @@
#define OLDAUTHCSTR "bindprincipal"
#define EXATTRSSTR "exattrs"
#define MANAGEDSAITSTR "manageDSAit"
+#define MASTERCHECKINTSTR "mastercheckint"
/* FIXME: unused */
#define LASTMODSTR "lastmod"
@@ -3201,6 +3232,17 @@
Debug( LDAP_DEBUG_ANY, "%s: %s.\n", c->log, c->msg, 0 );
return 1;
}
+ } else if ( !strncasecmp( c->argv[ i ], MASTERCHECKINTSTR "=",
+ STRLENOF( MASTERCHECKINTSTR "=" ) ) )
+ {
+ val = c->argv[ i ] + STRLENOF( MASTERCHECKINTSTR "=" );
+ if ( lutil_atoi( &si->si_mastercheck_int, val ) != 0 || si->si_mastercheck_int < 0 ) {
+ snprintf( c->msg, sizeof( c->msg ),
+ "invalid master check interval value \"%s\".\n",
+ val );
+ Debug( LDAP_DEBUG_ANY, "%s: %s.\n", c->log, c->msg, 0 );
+ return 1;
+ }
} else if ( !strncasecmp( c->argv[ i ], SYNCDATASTR "=",
STRLENOF( SYNCDATASTR "=" ) ) )
{
@@ -3276,6 +3318,7 @@
si->si_tlimit = 0;
si->si_slimit = 0;
si->si_conn_setup = 0;
+ si->si_mastercheck_int = 0;
si->si_presentlist = NULL;
LDAP_LIST_INIT( &si->si_nonpresentlist );
@@ -3304,6 +3347,7 @@
SLAP_DBFLAGS(c->be) |= SLAP_DBFLAG_NO_SCHEMA_CHECK;
}
c->be->be_syncinfo = si;
+
return 0;
}
}
@@ -3438,6 +3482,22 @@
ber_dupbv( bv, &bc );
}
+static int add_mastercheck( ConfigArgs *c ) {
+ int rc;
+ syncinfo_t *si = c->be->be_syncinfo;
+
+ if ( si->si_mastercheck_int == 0 )
+ return 0;
+
+ rc = ldap_pvt_runqueue_insert( &slapd_rq, si->si_mastercheck_int * 60,
+ do_mastercheck, si, "do_mastercheck", c->be->be_suffix[0].bv_val );
+ printf("Mastercheck int: %d\n", c->be->be_syncinfo->si_mastercheck_int);
+ if (rc < 0)
+ Debug( LDAP_DEBUG_ANY, "failed to add syncinfo\n", 0, 0, 0 );
+
+ return rc;
+}
+
int
syncrepl_config( ConfigArgs *c )
{
@@ -3473,5 +3533,7 @@
} else if ( add_syncrepl( c ) ) {
return(1);
}
+
+ add_mastercheck(c);
return config_sync_shadow( c );
}