From 093f607669f2a75a81ef749d97cea5e58f66c6ec Mon Sep 17 00:00:00 2001 From: William Brown Date: Wed, 13 Jan 2016 12:21:25 +1000 Subject: [PATCH] Ticket 48384 - Fix dblayer_is_cachesize_sane and dblayer_sys_pages for linux Bug Description: At this point in time, the current algorithm to determine if the cachesizing is sane is based on: issane = (int)((*cachesize / pagesize) <= (pages - procpages)); However, the values of pages and procpages are suspect: Consider: dblayer_is_cachesize_sane pages=3050679 / procpages=268505910 This isn't a type: procpages often exceeds pages. This is because procpages is derived from /proc/pid/status, vmsize, which is the maximum amount of ram a process *could* allocate. Additionally, the value of pages may exceed out vmsize, so we may be over eagerly allocating memory, that we don't actually have access to. Vmsize also includes swap space, so we might be trying to alloc memory into swap too. dblayer_is_cachesize_sane also only takes into account pages (total on system) and the current process' allocation: This makes no regard to: * How much ram is *actually* free on the system with respect to other processes * The value of getrlimit via availpages. The first condition is especially bad, because our system may be approaching an OOM condition, and we blazenly allocate a swathe of pages which triggers this. Fix Description: First, this fix corrects procpages to be based on vmrss, which is the actual working set size of the process, rather than the maximum possible allocation in vmsize. If CONSERVATIVE is defined (In this it defaults to true), the value of pages is taken from the smaller of: * vmsize * systeminfo total ram (excluding swap) The value of availpages is derived from the smallest of: * pages * getrlimit * freepages (with consideration of all processes on system) The check for issane now checks that the cachepage request is smaller than availpages, which guarantees: * Our system actually has the ram free to accomodate them without swapping or triggering an OOM condition. * We respect rlimits in the allocation. https://fedorahosted.org/389/ticket/48384 Author: wibrown Review by: ??? --- ldap/servers/slapd/back-ldbm/dblayer.c | 202 ++++++++++++++++++++++++++++----- 1 file changed, 173 insertions(+), 29 deletions(-) diff --git a/ldap/servers/slapd/back-ldbm/dblayer.c b/ldap/servers/slapd/back-ldbm/dblayer.c index e65f3cf..07e0095 100644 --- a/ldap/servers/slapd/back-ldbm/dblayer.c +++ b/ldap/servers/slapd/back-ldbm/dblayer.c @@ -216,6 +216,10 @@ static void dblayer_pop_pvt_txn(); #define TXN_TEST_INDEXES "TXN_TEST_INDEXES" /* list of indexes to use - comma delimited - id2entry,entryrdn,etc. */ #define TXN_TEST_VERBOSE "TXN_TEST_VERBOSE" /* be wordy */ +/* Define that we want to use conservative memory calculation values */ +/* Should be removed in the future! */ +#define CONSERVATIVE 1 + /* This function compares two index keys. It is assumed that the values are already normalized, since they should have been when the index was created (by int_values2keys). @@ -932,47 +936,136 @@ void dblayer_sys_pages(size_t *pagesize, size_t *pages, size_t *procpages, size_ #ifdef LINUX { + /* + * On linux because of the way that the virtual memory system works, we + * don't really need to think about other processes, or fighting them. + * But that's not without quirks. + * + * We are given a virtual memory space, represented by vsize (man 5 proc) + * This space is a "funny number". It's a best effort based system + * where linux instead of telling us how much memory *actually* exists + * for us to use, gives us a virtual memory allocation which is the + * value of ram + swap. + * + * But none of these pages even exist or belong to us on the real system + * until will malloc them AND write a non-zero to them. + * + * The biggest issue with this is that vsize does NOT consider the + * effect other processes have on the system. So a process can malloc + * 2 Gig from the host, and our vsize doesn't reflect that until we + * suddenly can't malloc anything. + * + * We can see exactly what we are using inside of the vmm by + * looking at rss (man 5 proc). This shows us the current actual + * allocation of memory we are using. This is a good thing. + * + * We obviously don't want to have any pages in swap, but sometimes we + * can't help that: And there is also no guarantee that while we have + * X bytes in vsize, that we can even allocate any of them. Plus, we + * don't know if we are about to allocate to swap or not .... or get us + * killed in a blaze of oom glory. + * + * So there are now two strategies avaliable in this function. + * The first is to blindly accept what the VMM tells us about vsize + * while we hope and pray that we don't get nailed because we used + * too much. + * + * The other is a more conservative approach: We check vsize from + * proc/pid/status, and we check sysinfo for MemAvailable. + * Which ever value is "lower" is the upper bound on pages we could + * potentially allocate: generally, this will be MemAvailable. + */ + struct sysinfo si; size_t pages_per_mem_unit = 0; size_t mem_units_per_page = 0; /* We don't know if these units are really pages */ + size_t vmsize = 0; + size_t freesize = 0; sysinfo(&si); *pagesize = getpagesize(); + if (si.mem_unit > *pagesize) { pages_per_mem_unit = si.mem_unit / *pagesize; *pages = si.totalram * pages_per_mem_unit; +#ifdef CONSERVATIVE + freesize = si.freeram * pages_per_mem_unit; +#endif } else { mem_units_per_page = *pagesize / si.mem_unit; *pages = si.totalram / mem_units_per_page; +#ifdef CONSERVATIVE + freesize = si.freeram / mem_units_per_page; +#endif } + + /* Get the amount of freeram, rss, and the vmsize */ + + FILE *f; + char fn[40], s[80]; + + sprintf(fn, "/proc/%d/status", getpid()); + f = fopen(fn, "r"); + if (!f) /* fopen failed */ + return; + while (! feof(f)) { + fgets(s, 79, f); + if (feof(f)) + break; + /* VmRSS shows us what we are ACTUALLY using for proc pages + * Rather than "funny" pages. + */ +#ifdef CONSERVATIVE + if (strncmp(s, "VmSize:", 7) == 0) { + sscanf(s+7, "%lu", (long unsigned int *)&vmsize); + } +#endif + if (strncmp(s, "VmRSS:", 6) == 0) { + sscanf(s+6, "%lu", (long unsigned int *)procpages); + } + } + fclose(f); + /* procpages is now in kb not pages... */ + *procpages /= (*pagesize / 1024); *availpages = dblayer_getvirtualmemsize() / *pagesize; - /* okay i take that back, linux's method is more retarded here. - * hopefully linux doesn't have the FILE* problem that solaris does - * (where you can't use FILE if you have more than 256 fd's open) + /* Now we have vmsize, the availpages from getrlimit, our freesize */ +#ifdef CONSERVATIVE + vmsize /= (*pagesize / 1024); + + /* Pages is the total ram on the system. We should smaller of: + * - vmsize + * - pages */ - if (procpages) { - FILE *f; - char fn[40], s[80]; - - sprintf(fn, "/proc/%d/status", getpid()); - f = fopen(fn, "r"); - if (!f) /* fopen failed */ - return; - while (! feof(f)) { - fgets(s, 79, f); - if (feof(f)) - break; - if (strncmp(s, "VmSize:", 7) == 0) { - sscanf(s+7, "%lu", (long unsigned int *)procpages); - break; - } - } - fclose(f); - /* procpages is now in 1k chunks, not pages... */ - *procpages /= (*pagesize / 1024); + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_sys_pages pages=%u, vmsize=%u, \n", *pages, vmsize,0); + if (vmsize < *pages) { + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_sys_pages using vmsize for pages \n",0,0,0); + *pages = vmsize; + } else { + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_sys_pages using pages for pages \n",0,0,0); } + + /* Availpages is how much we *could* alloc. We should take the smallest: + * - pages + * - getrlimit (availpages) + * - freesize + */ + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_sys_pages pages=%u, getrlim=%u, freesize=%u\n", *pages, *availpages, freesize); + if (*pages < *availpages && *pages < freesize) { + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_sys_pages using pages for availpages \n",0,0,0); + *availpages = *pages; + } else if ( freesize < *pages && freesize < *availpages ) { + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_sys_pages using freesize for availpages \n",0,0,0); + *availpages = freesize; + } else { + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_sys_pages using getrlim for availpages \n",0,0,0); + } + +#endif /* Conservative */ + } -#endif +#endif /* linux */ + + #if defined ( hpux ) { @@ -1030,26 +1123,77 @@ void dblayer_sys_pages(size_t *pagesize, size_t *pages, size_t *procpages, size_ } } #endif + +/* Should we make this log at a different level so it's easier to see + * the decision making process inside of DS? + */ +slapi_log_error(SLAPI_LOG_TRACE,"dblayer_sys_pages", "USING pages=%u, procpages=%u, availpages=%u \n", *pages, *procpages, *availpages); + } int dblayer_is_cachesize_sane(size_t *cachesize) { - size_t pages = 0, pagesize = 0, procpages = 0, availpages = 0; + size_t pages = 0; + size_t pagesize = 0; + size_t procpages = 0; + size_t availpages = 0; + + size_t cachepages = 0; + int issane = 1; dblayer_sys_pages(&pagesize, &pages, &procpages, &availpages); - if (!pagesize || !pages) - return 1; /* do nothing when we can't get the avail mem */ +#ifdef LINUX && CONSERVATIVE + if (!pagesize || !availpages) { + return 1; + } +#else + if (!pagesize || !pages) { + return 1; + } +#endif + /* do nothing when we can't get the avail mem */ + + /* If the requested cache size is larger than the remaining pysical memory * after the current working set size for this process has been subtracted, * then we say that's insane and try to correct. */ - issane = (int)((*cachesize / pagesize) <= (pages - procpages)); + + cachepages = *cachesize / pagesize; + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_is_cachesize_sane cachesize=%u / pagesize=%u \n",*cachesize,pagesize,0); + +#ifdef LINUX && CONSERVATIVE + issane = (int)(cachepages <= availpages); + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_is_cachesize_sane cachepages=%u <= availpages=%u\n",cachepages,availpages,0); + + if (!issane) { + /* Since we are ask for more than what's available, we give half of + * the remaining system mem to the cachesize instead, and log a warning + */ + *cachesize = (size_t)((availpages / 2) * pagesize); + slapi_log_error(SLAPI_LOG_FATAL, "dblayer_is_cachesize_sane", "WARNING adjusted cachesize to %u\n",*cachesize); + /* Since we adjusted the value, we should set this value to be sane?*/ + /* issane = 1 */ + } +#else + size_t freepages = 0; + freepages = pages - procpages; + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_is_cachesize_sane pages=%u - procpages=%u\n",pages,procpages,0); + + issane = (int)(cachepages <= freepages); + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_is_cachesize_sane cachepages=%u <= freepages=%u\n",cachepages,freepages,0); + if (!issane) { *cachesize = (size_t)((pages - procpages) * pagesize); + slapi_log_error(SLAPI_LOG_FATAL, "dblayer_is_cachesize_sane", "dblayer_is_cachesize_sane WARNING adjusted cachesize to %u\n",*cachesize); + /* Since we adjusted the value, we should set this value to be sane?*/ + /* issane = 1 */ } - +#endif + LDAPDebug(LDAP_DEBUG_ANY,"dblayer_is_cachesize_sane issane=%u \n",issane,0,0); + return issane; } -- 2.5.0