From cf00085d8045cddd80a8aabad97de96fa8131793 Mon Sep 17 00:00:00 2001 From: Chandru Date: Sat, 30 Aug 2008 00:28:16 +1000 Subject: powerpc: Add support for dynamic reconfiguration memory in kexec/kdump kernels Kdump kernel needs to use only those memory regions that it is allowed to use (crashkernel, rtas, tce, etc.). Each of these regions have their own sizes and are currently added under 'linux,usable-memory' property under each memory@xxx node of the device tree. The ibm,dynamic-memory property of ibm,dynamic-reconfiguration-memory node (on POWER6) now stores in it the representation for most of the logical memory blocks with the size of each memory block being a constant (lmb_size). If one or more or part of the above mentioned regions lie under one of the lmb from ibm,dynamic-memory property, there is a need to identify those regions within the given lmb. This makes the kernel recognize a new 'linux,drconf-usable-memory' property added by kexec-tools. Each entry in this property is of the form of a count followed by that many (base, size) pairs for the above mentioned regions. The number of cells in the count value is given by the #size-cells property of the root node. Signed-off-by: Chandru Siddalingappa Signed-off-by: Paul Mackerras --- arch/powerpc/mm/numa.c | 77 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/mm/numa.c') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index d9a18135133..be05457631d 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -150,6 +150,21 @@ static const int *of_get_associativity(struct device_node *dev) return of_get_property(dev, "ibm,associativity", NULL); } +/* + * Returns the property linux,drconf-usable-memory if + * it exists (the property exists only in kexec/kdump kernels, + * added by kexec-tools) + */ +static const u32 *of_get_usable_memory(struct device_node *memory) +{ + const u32 *prop; + u32 len; + prop = of_get_property(memory, "linux,drconf-usable-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + return prop; +} + /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa * info is found. */ @@ -486,15 +501,30 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start, return lmb_end_of_DRAM() - start; } +/* + * Reads the counter for a given entry in + * linux,drconf-usable-memory property + */ +static inline int __init read_usm_ranges(const u32 **usm) +{ + /* + * For each lmb in ibm,dynamic-memory a corresponding + * entry in linux,drconf-usable-memory property contains + * a counter followed by that many (base, size) duple. + * read the counter from linux,drconf-usable-memory + */ + return read_n_cells(n_mem_size_cells, usm); +} + /* * Extract NUMA information from the ibm,dynamic-reconfiguration-memory * node. This assumes n_mem_{addr,size}_cells have been set. */ static void __init parse_drconf_memory(struct device_node *memory) { - const u32 *dm; - unsigned int n, rc; - unsigned long lmb_size, size; + const u32 *dm, *usm; + unsigned int n, rc, ranges, is_kexec_kdump = 0; + unsigned long lmb_size, base, size, sz; int nid; struct assoc_arrays aa; @@ -510,6 +540,11 @@ static void __init parse_drconf_memory(struct device_node *memory) if (rc) return; + /* check if this is a kexec/kdump kernel */ + usm = of_get_usable_memory(memory); + if (usm != NULL) + is_kexec_kdump = 1; + for (; n != 0; --n) { struct of_drconf_cell drmem; @@ -521,21 +556,31 @@ static void __init parse_drconf_memory(struct device_node *memory) || !(drmem.flags & DRCONF_MEM_ASSIGNED)) continue; - nid = of_drconf_to_nid_single(&drmem, &aa); + base = drmem.base_addr; + size = lmb_size; + ranges = 1; - fake_numa_create_new_node( - ((drmem.base_addr + lmb_size) >> PAGE_SHIFT), + if (is_kexec_kdump) { + ranges = read_usm_ranges(&usm); + if (!ranges) /* there are no (base, size) duple */ + continue; + } + do { + if (is_kexec_kdump) { + base = read_n_cells(n_mem_addr_cells, &usm); + size = read_n_cells(n_mem_size_cells, &usm); + } + nid = of_drconf_to_nid_single(&drmem, &aa); + fake_numa_create_new_node( + ((base + size) >> PAGE_SHIFT), &nid); - - node_set_online(nid); - - size = numa_enforce_memory_limit(drmem.base_addr, lmb_size); - if (!size) - continue; - - add_active_range(nid, drmem.base_addr >> PAGE_SHIFT, - (drmem.base_addr >> PAGE_SHIFT) - + (size >> PAGE_SHIFT)); + node_set_online(nid); + sz = numa_enforce_memory_limit(base, size); + if (sz) + add_active_range(nid, base >> PAGE_SHIFT, + (base >> PAGE_SHIFT) + + (sz >> PAGE_SHIFT)); + } while (--ranges); } } -- cgit From 8f64e1f2d1e09267ac926e15090fd505c1c0cbcb Mon Sep 17 00:00:00 2001 From: Jon Tollefson Date: Thu, 9 Oct 2008 10:18:40 +0000 Subject: powerpc: Reserve in bootmem lmb reserved regions that cross NUMA nodes If there are multiple reserved memory blocks via lmb_reserve() that are contiguous addresses and on different NUMA nodes we are losing track of which address ranges to reserve in bootmem on which node. I discovered this when I recently got to try 16GB huge pages on a system with more then 2 nodes. When scanning the device tree in early boot we call lmb_reserve() with the addresses of the 16G pages that we find so that the memory doesn't get used for something else. For example the addresses for the pages could be 4000000000, 4400000000, 4800000000, 4C00000000, etc - 8 pages, one on each of eight nodes. In the lmb after all the pages have been reserved it will look something like the following: lmb_dump_all: memory.cnt = 0x2 memory.size = 0x3e80000000 memory.region[0x0].base = 0x0 .size = 0x1e80000000 memory.region[0x1].base = 0x4000000000 .size = 0x2000000000 reserved.cnt = 0x5 reserved.size = 0x3e80000000 reserved.region[0x0].base = 0x0 .size = 0x7b5000 reserved.region[0x1].base = 0x2a00000 .size = 0x78c000 reserved.region[0x2].base = 0x328c000 .size = 0x43000 reserved.region[0x3].base = 0xf4e8000 .size = 0xb18000 reserved.region[0x4].base = 0x4000000000 .size = 0x2000000000 The reserved.region[0x4] contains the 16G pages. In arch/powerpc/mm/num.c: do_init_bootmem() we loop through each of the node numbers looking for the reserved regions that belong to the particular node. It is not able to identify region 0x4 as being a part of each of the 8 nodes. It is assuming that a reserved region is only on a single node. This patch takes out the reserved region loop from inside the loop that goes over each node. It looks up the active region containing the start of the reserved region. If it extends past that active region then it adjusts the size and gets the next active region containing it. Signed-off-by: Jon Tollefson Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/numa.c | 108 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 28 deletions(-) (limited to 'arch/powerpc/mm/numa.c') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index be05457631d..6cf5c71c431 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -89,6 +89,46 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, return 0; } +/* + * get_active_region_work_fn - A helper function for get_node_active_region + * Returns datax set to the start_pfn and end_pfn if they contain + * the initial value of datax->start_pfn between them + * @start_pfn: start page(inclusive) of region to check + * @end_pfn: end page(exclusive) of region to check + * @datax: comes in with ->start_pfn set to value to search for and + * goes out with active range if it contains it + * Returns 1 if search value is in range else 0 + */ +static int __init get_active_region_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct node_active_region *data; + data = (struct node_active_region *)datax; + + if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) { + data->start_pfn = start_pfn; + data->end_pfn = end_pfn; + return 1; + } + return 0; + +} + +/* + * get_node_active_region - Return active region containing start_pfn + * @start_pfn: The page to return the region for. + * @node_ar: Returned set to the active region containing start_pfn + */ +static void __init get_node_active_region(unsigned long start_pfn, + struct node_active_region *node_ar) +{ + int nid = early_pfn_to_nid(start_pfn); + + node_ar->nid = nid; + node_ar->start_pfn = start_pfn; + work_with_active_regions(nid, get_active_region_work_fn, node_ar); +} + static void __cpuinit map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; @@ -882,38 +922,50 @@ void __init do_init_bootmem(void) start_pfn, end_pfn); free_bootmem_with_active_regions(nid, end_pfn); + } - /* Mark reserved regions on this node */ - for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].base; - unsigned long size = lmb.reserved.region[i].size; - unsigned long start_paddr = start_pfn << PAGE_SHIFT; - unsigned long end_paddr = end_pfn << PAGE_SHIFT; - - if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid && - early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid) - continue; - - if (physbase < end_paddr && - (physbase+size) > start_paddr) { - /* overlaps */ - if (physbase < start_paddr) { - size -= start_paddr - physbase; - physbase = start_paddr; - } - - if (size > end_paddr - physbase) - size = end_paddr - physbase; - - dbg("reserve_bootmem %lx %lx\n", physbase, - size); - reserve_bootmem_node(NODE_DATA(nid), physbase, - size, BOOTMEM_DEFAULT); - } + /* Mark reserved regions */ + for (i = 0; i < lmb.reserved.cnt; i++) { + unsigned long physbase = lmb.reserved.region[i].base; + unsigned long size = lmb.reserved.region[i].size; + unsigned long start_pfn = physbase >> PAGE_SHIFT; + unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT); + struct node_active_region node_ar; + + get_node_active_region(start_pfn, &node_ar); + while (start_pfn < end_pfn) { + /* + * if reserved region extends past active region + * then trim size to active region + */ + if (end_pfn > node_ar.end_pfn) + size = (node_ar.end_pfn << PAGE_SHIFT) + - (start_pfn << PAGE_SHIFT); + dbg("reserve_bootmem %lx %lx nid=%d\n", physbase, size, + node_ar.nid); + reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase, + size, BOOTMEM_DEFAULT); + /* + * if reserved region is contained in the active region + * then done. + */ + if (end_pfn <= node_ar.end_pfn) + break; + + /* + * reserved region extends past the active region + * get next active region that contains this + * reserved region + */ + start_pfn = node_ar.end_pfn; + physbase = start_pfn << PAGE_SHIFT; + get_node_active_region(start_pfn, &node_ar); } - sparse_memory_present_with_active_regions(nid); } + + for_each_online_node(nid) + sparse_memory_present_with_active_regions(nid); } void __init paging_init(void) -- cgit