/* Copyright 2005 George Peter Staplin */

/*
 * SSE2 memcpy experiment for 32-bit x86, GNU as (AT&T syntax), linked
 * against libc for printf.
 *
 * Calling convention used by the routines in this file:
 *   - arguments are pushed on the stack by the caller, which also pops them
 *   - the print_* helpers take their single argument in %eax and preserve
 *     every general-purpose register with pusha/popa
 *   - the direction flag is never modified, so it stays clear as the C
 *     calling convention expects
 *
 * NOTE(review): the stack is not realigned to 16 bytes before the calls to
 * printf.  Newer i386 toolchains assume that alignment; confirm against the
 * target libc if printf faults.
 */

        .global main

/* proc name: align to 4 bytes and define the label `name'. */
        .macro  proc name
        .align  4
\name:
        .endm

/* save / restore: push / pop one 32-bit register. */
        .macro  save reg
        pushl   \reg
        .endm

        .macro  restore reg
        popl    \reg
        .endm

        .equ    EXIT_SUCCESS,0
        .equ    SIZE_OF_POINTER,4
        .equ    SIZE_OF_QUAD,16

        .data
        .align  4
        .equ    big_src_size,4000
        .comm   big_src,big_src_size
        .comm   big_dest,big_src_size

        .text
double_fmt:
        .string "%G\n"
eflags_fmt:
        .string "0x%x\n"
reg_fmt:
        .string "reg 0x%x\n"
byte_fmt:
        .string "0x%x\n"

/*
 * print_reg reg: printf("reg 0x%x\n", reg) with all general-purpose
 * registers preserved.  The operand is pushed after pusha, so an operand
 * that involves %esp sees the stack pointer as adjusted by pusha.
 */
        .macro  print_reg reg
        pusha
        pushl   \reg
        pushl   $reg_fmt
        call    printf
        addl    $8,%esp
        popa
        .endm

/*
 * print_eflags: print the low byte of EFLAGS as loaded by lahf.
 * lahf writes to %ah, so the flag bits show up in bits 8..15 of the
 * printed value.  All general-purpose registers are preserved.
 */
proc print_eflags
        pusha
        movl    $0,%eax                 # mov (not xor) so the flags are untouched
        lahf
        pushl   %eax
        pushl   $eflags_fmt
        call    printf
        addl    $8,%esp
        popa
        ret

/*
 * print_double: printf("%G\n", *(double *)%eax).
 * This expects the double address to be in %eax.
 */
proc print_double
        pusha
        pushl   4(%eax)                 # double part 2 (high dword)
        pushl   (%eax)                  # double part 1 (low dword)
        pushl   $double_fmt
        call    printf
        addl    $12,%esp
        popa
        ret

/*
 * print_byte: printf("0x%x\n", *(unsigned char *)%eax).
 * This expects the address of a byte in %eax.
 */
proc print_byte
        pusha
        movzbl  (%eax),%ecx             # zero-extend the byte to 32 bits
        pushl   %ecx                    # push the value of the byte
        pushl   $byte_fmt
        call    printf
        addl    $(SIZE_OF_POINTER * 2),%esp
        popa
        ret

/* This seems to be for SSE3, and my processor doesn't support it. */
/* Encodes lddqu (%eax),%xmm0 */
        .macro  sse_lddqu
        .byte   0xf2, 0x0f, 0xf0, 0x00
        .endm

/*
 * These are for assemblers without SSE support.  These work with SSE2.
 * Each macro emits the raw encoding of a movdqu between %xmm0 and the
 * memory addressed by the register named in the macro.
 */
        .macro  sse2_movdqu_from_eax
        .byte   0xf3, 0x0f, 0x6f, 0x00
        .endm

        .macro  sse2_movdqu_to_eax
        .byte   0xf3, 0x0f, 0x7f, 0x00
        .endm

        .macro  sse2_movdqu_from_edx
        .byte   0xf3, 0x0f, 0x6f, 0x02
        .endm

        .macro  sse2_movdqu_to_edx
        .byte   0xf3, 0x0f, 0x7f, 0x02
        .endm

        .macro  sse2_movdqu_from_esi
        .byte   0xf3, 0x0f, 0x6f, 0x06
        .endm

        .macro  sse2_movdqu_to_edi
        .byte   0xf3, 0x0f, 0x7f, 0x07
        .endm

/*
 * sse2_memcpy: copy `size' bytes from `src' to `dest', ascending.
 *
 * Stack arguments (caller pushes size, src, dest in that order and pops
 * them afterwards):
 *    4(%esp)  dest
 *    8(%esp)  src
 *   12(%esp)  size
 *
 * Whole 16-byte chunks go through %xmm0 with unaligned moves; the
 * remaining 0..15 bytes are copied one at a time.
 *
 * Preserves %ecx, %edx, %esi, %edi.  Clobbers %eax, %xmm0 and the flags.
 * The direction flag is not touched.
 */
proc sse2_memcpy
        save    %ecx
        save    %edx
        save    %esi
        save    %edi

        /* Four saved registers plus the return address sit below the args. */
        movl    (SIZE_OF_POINTER * 7)(%esp),%eax        # size
        xorl    %edx,%edx                               # high half of the dividend
        movl    $SIZE_OF_QUAD,%ecx
        divl    %ecx                    # %eax = 16-byte chunks, %edx = remainder

        movl    (SIZE_OF_POINTER * 6)(%esp),%esi        # src
        movl    (SIZE_OF_POINTER * 5)(%esp),%edi        # dest

        movl    %eax,%ecx               # number of quads
        testl   %ecx,%ecx
        jz      2f
1:
        /* We have at least 1 quad chunk */
        movdqu  (%esi),%xmm0
        movdqu  %xmm0,(%edi)
        addl    $SIZE_OF_QUAD,%esi
        addl    $SIZE_OF_QUAD,%edi
        decl    %ecx
        jnz     1b
2:
        /* Fewer than 16 bytes are left; the count is still in %edx. */
        movl    %edx,%ecx
        testl   %ecx,%ecx
        jz      4f
3:
        /* store a byte at a time into the destination */
        movb    (%esi),%al
        movb    %al,(%edi)
        incl    %esi
        incl    %edi
        decl    %ecx
        jnz     3b
4:
        restore %edi
        restore %esi
        restore %edx
        restore %ecx
        ret

/*
 * Never executed: these instructions exist only so that main can dump
 * their encodings byte by byte.
 */
instruction_start:
        movdqu  (%esi),%xmm0
        /* movl $255,%ecx */
instruction_end:

test_start:
        movdqu  %xmm0,(%edi)
test_end:

        .data
/* 64-bit time-stamp counter samples, low dword first. */
before_sse2:
        .long   0
        .long   0
after_sse2:
        .long   0
        .long   0
final_ticks:
        .long   0
        .long   0

        .text
emit_cpu_time_fmt:
        .string "cpu %llu\n"

/*
 * print_ticks: printf("cpu %llu\n", value) for a 64-bit value.
 * This expects the address of a .long .long (low, high) in %eax.
 */
proc print_ticks
        pusha
        pushl   4(%eax)                 # high dword
        pushl   0(%eax)                 # low dword
        pushl   $emit_cpu_time_fmt
        call    printf
        addl    $(SIZE_OF_POINTER * 3),%esp
        popa
        ret

/*
 * tick_diff: *result = *after - *before, as 64-bit values.
 *
 * Stack arguments (caller pushes before, after, result in that order):
 *    4(%esp)  address of the result  (.long low, .long high)
 *    8(%esp)  address of `after'
 *   12(%esp)  address of `before'
 *
 * Preserves %ecx, %edx, %esi, %edi.  Clobbers %eax and the flags.
 */
proc tick_diff
        save    %ecx
        save    %edx
        save    %esi
        save    %edi

        movl    (SIZE_OF_POINTER * 6)(%esp),%eax        # after
        movl    0(%eax),%ecx                            # low after
        movl    4(%eax),%edx                            # high after

        movl    (SIZE_OF_POINTER * 7)(%esp),%eax        # before
        movl    0(%eax),%esi                            # low before
        movl    4(%eax),%edi                            # high before

        /* %edx:%ecx = after - before; the borrow carries into the high half */
        subl    %esi,%ecx
        sbbl    %edi,%edx

        movl    (SIZE_OF_POINTER * 5)(%esp),%eax        # result .long .long
        movl    %ecx,0(%eax)
        movl    %edx,4(%eax)

        restore %edi
        restore %esi
        restore %edx
        restore %ecx
        ret

/*
 * main: dump the encodings of the two sample movdqu instructions, time one
 * sse2_memcpy of big_src into big_dest with rdtsc, then print the canary
 * byte, both time stamps and their difference.  Returns EXIT_SUCCESS.
 */
proc main
        .if 1
        /* print_byte preserves all registers, so %eax/%ecx survive the call */
        movl    $(instruction_end - instruction_start),%ecx
        movl    $instruction_start,%eax
1:
        call    print_byte
        incl    %eax
        decl    %ecx
        jnz     1b

        movl    $(test_end - test_start),%ecx
        movl    $test_start,%eax
2:
        call    print_byte
        incl    %eax
        decl    %ecx
        jnz     2b
        .endif

        /* A canary to validate the range: the last byte of big_src. */
        movl    $big_src,%eax
        addl    $(big_src_size - 1),%eax
        movb    $0xff,(%eax)            # one byte only; a dword would overrun big_src

        /* Save the current 64-bit ticks from the CPU */
        movl    $before_sse2,%ecx
        rdtsc
        movl    %eax,(%ecx)
        movl    %edx,4(%ecx)

        /* A repeat loop (e.g. 200 iterations) could be wrapped around this call. */
        pushl   $(big_src_size - 1)     # size
        pushl   $big_src                # src
        pushl   $big_dest               # dest
        call    sse2_memcpy
        addl    $(SIZE_OF_POINTER * 3),%esp

        /* Save the latest 64-bit ticks from the CPU */
        movl    $after_sse2,%ecx
        rdtsc
        movl    %eax,(%ecx)
        movl    %edx,4(%ecx)

        /* Print the canary (should be 0xff) */
        movl    $big_src,%eax
        addl    $(big_src_size - 1),%eax
        movzbl  (%eax),%eax             # read just the canary byte
        print_reg %eax

        movl    $before_sse2,%eax
        call    print_ticks

        movl    $after_sse2,%eax
        call    print_ticks

        pushl   $before_sse2
        pushl   $after_sse2
        pushl   $final_ticks            # the result of after - before
        call    tick_diff
        addl    $(SIZE_OF_POINTER * 3),%esp

        movl    $final_ticks,%eax
        call    print_ticks

        movl    $EXIT_SUCCESS,%eax
        ret