x86-64: provide a memset() that can deal with 4Gb or above at a time

Now that a corresponding change got accepted into Linux 3.4, let's fix
this in our code too. It is particularly required by the memset() invoked
from __alloc_bootmem_core(), which can be called with sizes beyond 4Gb
out of alloc_node_mem_map() when CONFIG_FLAT_NODE_MEM_MAP is defined
(starting at around 300Gb).

In order to not affect the native kernel (which is unlikely to be
affected anyway, as it usually sets up separate maps for each node [as
long as NUMA is defined], and hence would require said amount of memory
per node [and SPARSEMEM not to be used] for the problem to become
visible, plus in this tree we're not really concerned about fixing native
problems), introduce a Xen-specific clone of the original file.

Signed-off-by: Jan Beulich

--- /dev/null
+++ b/arch/x86_64/lib/memset-xen.S
@@ -0,0 +1,122 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *
+ * rdi   destination
+ * rsi   value (char)
+ * rdx   count (bytes); all 64 bits are honoured
+ *
+ * rax   original destination
+ */
+	.globl __memset
+	.globl memset
+	.p2align 4
+memset:
+__memset:
+	movq %rdi,%r10		/* keep dst; it is the return value */
+
+	/* expand byte value */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	imulq  %rcx,%rax	/* rax = fill byte x8; rdx (count) untouched */
+
+	/* align dst */
+	movl  %edi,%r9d
+	andl  $7,%r9d		/* r9 = dst & 7 */
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+
+	movq  %rdx,%rcx		/* 64-bit move/shift: counts >= 4Gb survive */
+	shrq  $6,%rcx		/* rcx = number of 64-byte chunks */
+	jz	 .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decq  %rcx		/* ZF consumed by jnz below; mov/lea keep flags */
+	movq  %rax,(%rdi)
+	movq  %rax,8(%rdi)
+	movq  %rax,16(%rdi)
+	movq  %rax,24(%rdi)
+	movq  %rax,32(%rdi)
+	movq  %rax,40(%rdi)
+	movq  %rax,48(%rdi)
+	movq  %rax,56(%rdi)
+	leaq  64(%rdi),%rdi
+	jnz    .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl	%edx,%ecx	/* only the low 6 bits of count matter now */
+	andl    $63&(~7),%ecx	/* bytes left in whole qwords (0..56) */
+	jz 		.Lhandle_7
+	shrl	$3,%ecx		/* -> qword count (1..7) */
+	.p2align 4
+.Lloop_8:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	leaq  8(%rdi),%rdi
+	jnz    .Lloop_8
+
+.Lhandle_7:
+	andl	$7,%edx		/* trailing bytes (0..7) */
+	jz      .Lende
+	.p2align 4
+.Lloop_1:
+	decl    %edx
+	movb 	%al,(%rdi)
+	leaq	1(%rdi),%rdi
+	jnz     .Lloop_1
+
+.Lende:
+	movq	%r10,%rax	/* return the original destination */
+	ret
+
+.Lbad_alignment:
+	cmpq $7,%rdx
+	jbe	.Lhandle_7	/* count <= 7: byte loop only */
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8
+	subq %r9,%r8		/* r8 = 8 - (dst & 7): bytes up to next 8-byte boundary */
+	addq %r8,%rdi
+	subq %r8,%rdx
+	jmp .Lafter_bad_alignment
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad  memset
+	.quad  memset_c
+	.byte  X86_FEATURE_REP_GOOD	/* defined by <asm/cpufeature.h> */
+	.byte  memset_c_end-memset_c
+	.byte  memset_c_end-memset_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi value
+  * rdx count
+  */
+memset_c:
+	movq %rdi,%r9		/* keep dst; it is the return value */
+	movq %rdx,%rcx
+	andl $7,%edx		/* edx = trailing bytes (0..7) */
+	shrq $3,%rcx		/* rcx = qword count, full 64 bits */
+	/* expand byte value */
+	movzbl %sil,%esi
+	movabs $0x0101010101010101,%rax
+	imulq  %rsi,%rax	/* rax = fill byte x8 */
+	rep
+	stosq
+	movl %edx,%ecx		/* 32-bit write zero-extends into rcx */
+	rep
+	stosb
+	movq %r9,%rax
+	ret
+memset_c_end:
+	.previous