
BerkeleyDB performance on Linux

The attached patch makes O_DIRECT work on Linux in BerkeleyDB 4.5.20. (You will need to manually define LINUX_NEEDS_PAGE_ALIGNMENT if you're using a kernel older than 2.6.)
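
(For anyone unfamiliar with the constraint: a file opened with O_DIRECT
on Linux requires the user buffer, the file offset, and the transfer size
to be suitably aligned. 512-byte alignment is enough on 2.6 kernels;
older kernels need full page (4096-byte) alignment. The standalone sketch
below is not part of the patch and is not BDB code, just an illustration
of the constraint with an arbitrary filename and size; the mp.h changes
in the patch arrange the same alignment for the buffers in BDB's cache.)

	#define _GNU_SOURCE		/* for O_DIRECT */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	#define ALIGNMENT	4096	/* 512 suffices on 2.6+ kernels */

	int
	main(void)
	{
		void *buf;
		int fd;

		/* The buffer must be aligned; plain malloc() is not enough. */
		if (posix_memalign(&buf, ALIGNMENT, ALIGNMENT) != 0)
			return (1);
		memset(buf, 0, ALIGNMENT);

		if ((fd = open("testfile",
		    O_WRONLY | O_CREAT | O_DIRECT, 0644)) == -1) {
			perror("open");
			return (1);
		}

		/* Offset and length must also be multiples of the alignment. */
		if (pwrite(fd, buf, ALIGNMENT, 0) == -1)
			perror("pwrite");

		(void)close(fd);
		free(buf);
		return (0);
	}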

The main reason to use this patch is to conserve memory - ordinarily, all the I/O that BDB does to its files gets cached in the Linux filesystem buffer cache. This caching is redundant since BDB always does its own caching, and it effectively makes the BDB environment consume twice as much memory as it needs. Using O_DIRECT on I/Os disables the filesystem buffer cache for those I/Os, thus freeing up a sizable chunk of memory.

The caching problem is particularly aggravated on Linux because the memory manager doesn't give program pages higher priority than cache pages. So when your system is tight on memory, the kernel will start swapping program data pages before it starts reclaiming buffer cache pages, and application performance plummets. (Possibly that indicates a kernel bug, or at least a misfeature.)

Note that you must configure BerkeleyDB with --enable-o_direct to build in the support, and add "set_flags DB_DIRECT_DB" to the DB_CONFIG file of each environment where you want it used.
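
For example, the full sequence is roughly as follows (shown for
illustration only; add CPPFLAGS="-DLINUX_NEEDS_PAGE_ALIGNMENT" to the
configure line if you're on a pre-2.6 kernel):

	cd build_unix
	../dist/configure --enable-o_direct
	make

	# in the DB_CONFIG of each environment that should use O_DIRECT
	set_flags DB_DIRECT_DB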

With this patch, a slapd that occupies 6.8GB on a system with 8GB of RAM can run continuously without swapping, delivering a sustained 11,500 authentications per second. Without the patch, swapping starts when the process hits the 4.5GB mark (because over 3GB of buffer cache is in use), and performance drops to only *hundreds* of authentications per second.

--
  -- Howard Chu
  Chief Architect, Symas Corp.  http://www.symas.com
  Director, Highland Sun        http://highlandsun.com/hyc
  OpenLDAP Core Team            http://www.openldap.org/project/
--- dbinc/mp.h.orig	2006-09-07 14:31:58.000000000 -0700
+++ dbinc/mp.h	2007-01-06 19:14:56.000000000 -0800
@@ -378,6 +378,23 @@
 #define	BH_FREE_REUSE		0x02
 #define	BH_FREE_UNLOCKED	0x04
 
+#ifdef DIAG_MVCC
+#define	BH_ALIGNED
+#define VM_PAGESIZE 4096
+#endif
+
+/* Linux O_DIRECT needs aligned buffers. 2.6 kernel allows 512 byte
+ * alignment, otherwise need page sized (4096).
+ */
+#if defined(linux) && !defined(BH_ALIGNED)
+#define	BH_ALIGNED
+#ifdef LINUX_NEEDS_PAGE_ALIGNMENT
+#define VM_PAGESIZE 4096
+#else	/* Linux 2.6+ */
+#define VM_PAGESIZE 512
+#endif
+#endif
+
 /*
  * BH --
  *	Buffer header.
@@ -404,7 +421,7 @@
 
 	roff_t		td_off;		/* MVCC: creating TXN_DETAIL offset. */
 	SH_CHAIN_ENTRY	vc;		/* MVCC: version chain. */
-#ifdef DIAG_MVCC
+#ifdef BH_ALIGNED
 	u_int16_t	align_off;	/* Alignment offset for diagnostics.*/
 #endif
 
@@ -465,15 +482,14 @@
     (dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT) &&		\
     dbc->txn->td != NULL && __memp_skip_curadj(dbc, pgno))
 
-#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
-#define	VM_PAGESIZE 4096
-#define	MVCC_BHSIZE(mfp, sz) do {					\
+#ifdef BH_ALIGNED
+#define	BHSIZE(mfp, sz) do {					\
 	sz += VM_PAGESIZE + sizeof(BH);					\
 	if (mfp->stat.st_pagesize < VM_PAGESIZE)			\
 		sz += VM_PAGESIZE - mfp->stat.st_pagesize;		\
 } while (0)
 
-#define	MVCC_BHALIGN(mfp, p) do {					\
+#define	BHALIGN(mfp, p) do {					\
 	if (mfp != NULL) {						\
 		BH *__bhp;						\
 		void *__orig = (p);					\
@@ -493,13 +509,19 @@
 	}								\
 } while (0)
 
-#define	MVCC_BHUNALIGN(mfp, p) do {					\
+#define	BHUNALIGN(mfp, p) do {					\
 	if ((mfp) != NULL) {						\
 		BH *bhp = (BH *)(p);					\
 		(p) = ((u_int8_t *)bhp - bhp->align_off);		\
 	}								\
 } while (0)
+#else
+#define	BHSIZE(mfp, sz) do {} while (0)
+#define	BHALIGN(mfp, p) do {} while (0)
+#define	BHUNALIGN(mfp, p) do {} while (0)
+#endif
 
+#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
 #ifdef linux
 #define	MVCC_MPROTECT(buf, sz, mode) do {				\
 	int __ret = mprotect((buf), (sz), (mode));			\
@@ -513,11 +535,7 @@
 	}								\
 } while (0)
 #endif /* linux */
-
-#else /* defined(DIAG_MVCC) && defined(HAVE_MPROTECT) */
-#define	MVCC_BHSIZE(mfp, sz) do {} while (0)
-#define	MVCC_BHALIGN(mfp, p) do {} while (0)
-#define	MVCC_BHUNALIGN(mfp, p) do {} while (0)
+#else
 #define	MVCC_MPROTECT(buf, size, mode) do {} while (0)
 #endif
 
--- mp/mp_alloc.c.orig	2006-09-07 14:32:03.000000000 -0700
+++ mp/mp_alloc.c	2007-01-06 19:14:56.000000000 -0800
@@ -66,7 +66,7 @@
 	if (mfp != NULL) {
 		len = SSZA(BH, buf) + mfp->stat.st_pagesize;
 		/* Add space for alignment padding for MVCC diagnostics. */
-		MVCC_BHSIZE(mfp, len);
+		BHSIZE(mfp, len);
 	}
 
 	MPOOL_REGION_LOCK(dbenv, infop);
@@ -91,10 +91,10 @@
 			c_mp->stat.st_pages++;
 		MPOOL_REGION_UNLOCK(dbenv, infop);
 		/*
-		 * For MVCC diagnostics, align the pointer so that the buffer
+		 * If necessary, align the pointer so that the buffer
 		 * starts on a page boundary.
 		 */
-		MVCC_BHALIGN(mfp, p);
+		BHALIGN(mfp, p);
 
 found:		if (offsetp != NULL)
 			*offsetp = R_OFFSET(infop, p);
@@ -447,7 +447,7 @@
 	MPOOLFILE *mfp;
 	void *buf;
 {
-	MVCC_BHUNALIGN(mfp, buf);
+	BHUNALIGN(mfp, buf);
 	COMPQUIET(mfp, NULL);
 	__db_shalloc_free(infop, buf);
 }
--- mp/mp_fget.c.orig	2006-09-13 09:22:42.000000000 -0700
+++ mp/mp_fget.c	2007-01-06 19:14:56.000000000 -0800
@@ -708,7 +708,7 @@
 		 * the hash bucket's priority.
 		 */
 		/*lint --e{668} (flexelint: bhp cannot be NULL). */
-#ifdef DIAG_MVCC
+#ifdef BH_ALIGNED
 		memset(bhp, 0, SSZ(BH, align_off));
 #else
 		memset(bhp, 0, sizeof(BH));