/*         ______   ___    ___
 *        /\  _  \ /\_ \  /\_ \
 *        \ \ \L\ \\//\ \ \//\ \      __     __   _ __   ___
 *         \ \  __ \ \ \ \  \ \ \   /'__`\ /'_ `\/\`'__\/ __`\
 *          \ \ \/\ \ \_\ \_ \_\ \_/\  __//\ \L\ \ \ \//\ \L\ \
 *           \ \_\ \_\/\____\/\____\ \____\ \____ \ \_\\ \____/
 *            \/_/\/_/\/____/\/____/\/____/\/___L\ \/_/ \/___/
 *                                           /\____/
 *                                           \_/__/
 *
 *      16 bit bitmap blitting (written for speed, not readability :-)
 *
 *      By Shawn Hargreaves.
 *
 *      MMX clear code by Robert Ohannessian.
 *
 *      Blitting and masked blitting optimised by Jose Antonio Luque.
 *
 *      See readme.txt for copyright information.
 */

#include "asmdefs.inc"
#include "blit.inc"

#ifdef ALLEGRO_COLOR16

.text



/* void _linear_clear_to_color16(BITMAP *bitmap, int color);
 *  Fills a linear bitmap with the specified color.
 */
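/* For reference, a rough C equivalent of what this routine implements. This
 * is only a sketch, assuming the usual Allegro BITMAP clip fields (cl, cr,
 * ct, cb) and line pointer table; the real code below also handles bank
 * switching, 64-bit alignment and an MMX fill path. The helper name is
 * hypothetical.
 *
 *    void clear_to_color16_sketch(BITMAP *bmp, int color)
 *    {
 *       int x, y;
 *       for (y = bmp->ct; y < bmp->cb; y++) {
 *          unsigned short *line = (unsigned short *)bmp->line[y];
 *          for (x = bmp->cl; x < bmp->cr; x++)
 *             line[x] = (unsigned short)color;
 *       }
 *    }
 */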
FUNC(_linear_clear_to_color16)
   pushl %ebp
   movl %esp, %ebp
   pushl %edi
   pushl %esi
   pushl %ebx
   pushl %es

   movl ARG1, %edx               /* edx = bmp */
   movl BMP_CT(%edx), %ebx       /* line to start at */

   movl BMP_SEG(%edx), %eax      /* select segment */
   movl %eax, %es

   movl BMP_CR(%edx), %esi       /* width to clear */
   subl BMP_CL(%edx), %esi

#ifdef ALLEGRO_MMX               /* only use MMX if compiler supports it */

   movl GLOBL(cpu_capabilities), %eax     /* if MMX is enabled (or not disabled :) */
   andl $CPU_MMX, %eax
   jz clear_no_mmx

   movl %esi, %eax               /* if there are fewer than 16 pixels to clear, use the non-MMX version */
   shrl $4, %eax
   orl %eax, %eax
   jz clear_no_mmx

   movl ARG2, %eax               /* duplicate color twice */
   movl %eax, %ecx
   shll $16, %eax
   andl $0xFFFF, %ecx
   orl %ecx, %eax

   pushl %eax

   movl %ds, %eax                /* can we use nearptr? */
   movl %es, %ecx
   cmpw %ax, %cx
   jne clearMMXseg_loop          /* if not, then we have to decode segments... */
                                 /* else, we save one cycle per 4 pixels on PMMX/K6 */
   _align_
clearMMX_loop:
   movl %ebx, %eax
   movl BMP_CL(%edx), %edi
   WRITE_BANK()                  /* select bank */
   leal (%eax, %edi, 2), %edi    /* get line address */

   popl %eax                     /* get eax back */
   movl %esi, %ecx               /* width to clear */

   movd %eax, %mm0               /* restore mmx reg 0 in case it's been clobbered by WRITE_BANK() */
   movd %eax, %mm1
   psllq $32, %mm0
   por %mm1, %mm0

   pushl %eax                    /* save eax */

   testl $7, %edi                /* is destination aligned on 64-bit? */
   jz clearMMX_aligned

clearMMX_do_alignment:
   movl %edi, %eax               /* we want to adjust %ecx (pairing: see andl) */
   movq %mm0, (%edi)             /* we clear 4 pixels */
   andl $7, %eax                 /* we calc how many pixels we actually wanted to clear (8 - %eax) (see subl) */
   andl $0xFFFFFFF8, %edi        /* instruction pairing (see inc %edi) */
   shrl $1, %eax
   subl $4, %eax
   addl $8, %edi                 /* we set %edi to the next aligned memory address */
   addl %eax, %ecx               /* and adjust %ecx to reflect the change */

clearMMX_aligned:
   movl %ecx, %eax               /* save for later */
   shrl $4, %ecx                 /* divide by 16 for 4 * 8-byte memory move */
   jz clearMMX_finish_line       /* if there are fewer than 16 pixels to clear, no need for MMX */

clearMMX_continue_line:
   movq %mm0, (%edi)             /* move 4x 8 bytes */
   movq %mm0, 8(%edi)            /* MMX instructions can't pair when both write to memory */
   movq %mm0, 16(%edi)
   movq %mm0, 24(%edi)
   addl $32, %edi                /* inserting those in the MMX copy block makes no difference */
   decl %ecx
   jnz clearMMX_continue_line

clearMMX_finish_line:
   movl %eax, %ecx               /* get ecx back */
   testl $15, %ecx               /* check if there's any left */
   jz clearMMX_no_long

   /* else, write trailing pixels */
   testl $8, %ecx
   jz clearMMX_finish_line2
   movq %mm0, (%edi)
   movq %mm0, 8(%edi)
   addl $16, %edi

clearMMX_finish_line2:
   testl $4, %ecx
   jz clearMMX_finish_line3
   movq %mm0, (%edi)
   addl $8, %edi

clearMMX_finish_line3:
   andl $3, %ecx
   subl $4, %ecx
   shll $1, %ecx
   movq %mm0, (%edi, %ecx)

clearMMX_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)
   jg clearMMX_loop              /* and loop */

   popl %eax

   emms                          /* clear FPU tag word */
   jmp clear_done

clearMMXseg_loop:
   movl %ebx, %eax
   movl BMP_CL(%edx), %edi
   WRITE_BANK()                  /* select bank */
   leal (%eax, %edi, 2), %edi    /* get line address */

   popl %eax                     /* get eax back */
   movl %esi, %ecx               /* width to clear */

   movd %eax, %mm0               /* restore mmx reg 0 in case it's been clobbered by WRITE_BANK() */
   movd %eax, %mm1
   psllq $32, %mm0
   por %mm1, %mm0

   pushl %eax                    /* save eax */

   testl $7, %edi                /* is destination aligned on 64-bit? */
   jz clearMMXseg_aligned

clearMMXseg_do_alignment:
   movl %edi, %eax               /* we want to adjust %ecx (pairing: see andl) */
   movq %mm0, %es:(%edi)         /* we clear 4 pixels */
   andl $7, %eax                 /* we calc how many pixels we actually wanted to clear (8 - %eax) (see subl) */
   andl $0xFFFFFFF8, %edi        /* instruction pairing (see inc %edi) */
   shrl $1, %eax
   subl $4, %eax
   addl $8, %edi                 /* we set %edi to the next aligned memory address */
   addl %eax, %ecx               /* and adjust %ecx to reflect the change */

clearMMXseg_aligned:
   movl %ecx, %eax               /* save for later */
   shrl $4, %ecx                 /* divide by 16 for 4 * 8-byte memory move */
   jz clearMMXseg_finish_line    /* if there are fewer than 16 pixels to clear, no need for MMX */

clearMMXseg_continue_line:
   movq %mm0, %es:(%edi)
   movq %mm0, %es:8(%edi)
   movq %mm0, %es:16(%edi)
   movq %mm0, %es:24(%edi)
   addl $32, %edi
   decl %ecx
   jnz clearMMXseg_continue_line

clearMMXseg_finish_line:
   movl %eax, %ecx
   testl $15, %ecx               /* check if there's any left */
   jz clearMMXseg_no_long

   /* else, write trailing pixels */
   testl $8, %ecx
   jz clearMMXseg_finish_line2
   movq %mm0, %es:(%edi)
   movq %mm0, %es:8(%edi)
   addl $16, %edi

clearMMXseg_finish_line2:
   testl $4, %ecx
   jz clearMMXseg_finish_line3
   movq %mm0, %es:(%edi)
   addl $8, %edi

clearMMXseg_finish_line3:
   andl $3, %ecx
   subl $4, %ecx
   shll $1, %ecx
   movq %mm0, %es:(%edi, %ecx)

clearMMXseg_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)
   jg clearMMXseg_loop           /* and loop */

   popl %eax

   emms                          /* clear FPU tag word */
   jmp clear_done

#endif                           /* ALLEGRO_MMX */

clear_no_mmx:
   /* If no MMX is available, use the non-MMX version */
   cld

   _align_
clear_loop:
   movl %ebx, %eax
   movl BMP_CL(%edx), %edi
   WRITE_BANK()                  /* select bank */
   leal (%eax, %edi, 2), %edi    /* get line address */

   movw ARG2, %ax                /* duplicate color twice */
   shll $16, %eax
   movw ARG2, %ax

   movl %esi, %ecx               /* width to clear */
   shrl $1, %ecx                 /* halve for 32 bit clear */
   jnc clear_no_word
   stosw                         /* clear an odd word */

clear_no_word:
   jz clear_no_long

   rep ; stosl                   /* clear the line */

clear_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)
   jg clear_loop                 /* and loop */

clear_done:
   popl %es

   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_clear_to_color16() */




/* void _linear_blit16(BITMAP *source, BITMAP *dest, int source_x, source_y,
 *                     int dest_x, dest_y, int width, height);
 *  Normal forwards blitting routine for linear bitmaps.
 */
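/* Roughly, the forward blitter below is equivalent to the following C
 * sketch (hypothetical helper name, assuming memory-linear line pointers;
 * the real routine goes through the bank switchers and picks word, long or
 * MMX 64-bit copies depending on the width):
 *
 *    #include <string.h>
 *
 *    void blit16_sketch(BITMAP *src, BITMAP *dst, int sx, int sy,
 *                       int dx, int dy, int w, int h)
 *    {
 *       int y;
 *       for (y = 0; y < h; y++)
 *          memcpy((unsigned short *)dst->line[dy + y] + dx,
 *                 (unsigned short *)src->line[sy + y] + sx,
 *                 w * sizeof(unsigned short));
 *    }
 */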
FUNC(_linear_blit16)
   pushl %ebp
   movl %esp, %ebp
   pushl %edi
   pushl %esi
   pushl %ebx
   pushl %es

   movl B_DEST, %edx
   movl %ds, %ebx                /* save data segment selector */
   movl BMP_SEG(%edx), %eax      /* load destination segment */
   movl %eax, %es

#ifdef ALLEGRO_MMX               /* only use MMX if the compiler supports it */

   movl GLOBL(cpu_capabilities), %eax     /* if MMX is enabled (or not disabled :) */
   andl $CPU_MMX, %eax
   jz blit_no_mmx

   shrl $1, B_WIDTH              /* divide for use longs */
   jz blit_only_one_word         /* blit only one word */
   jnc blit_longsmmx

   shrl $1, B_WIDTH              /* divide for use longs64 */
   jz blit_long_word             /* blit one long and word */
   jnc blit_even_wmmxlongs       /* blit longs64 and word */
   jmp blit_mmxlong_word         /* blit longs64 and long and word */

blit_longsmmx:
   shrl $1, B_WIDTH              /* divide for use longs64 */
   jz blit_only_one_long         /* blit only one long */
   jnc blit_even_mmxlongs        /* blit longs64 */

   _align_
blit_mmxlong_long:               /* blit longs64 and long */
#define BLIT_CODE                \
   even_llmmx_loop:              \
      movq %ds:(%esi), %mm0 ;    \
      addl $8, %esi ;            \
      movq %mm0, %es:(%edi) ;    \
      addl $8, %edi ;            \
      decl %ecx ;                \
      jnz even_llmmx_loop ;      \
      movsl
   BLIT_LOOP(long_longsmmx, 2, BLIT_CODE)
#undef BLIT_CODE
   emms
   jmp blit_done

   _align_
blit_mmxlong_word:
#define BLIT_CODE                \
   even_wlmmx_loop:              \
      movq %ds:(%esi), %mm0 ;    \
      addl $8, %esi ;            \
      movq %mm0, %es:(%edi) ;    \
      addl $8, %edi ;            \
      decl %ecx ;                \
      jnz even_wlmmx_loop ;      \
      movsl ;                    \
      movsw
   BLIT_LOOP(word_longsmmx, 2, BLIT_CODE)
#undef BLIT_CODE
   emms
   jmp blit_done

   _align_
blit_even_wmmxlongs:
#define BLIT_CODE                \
   even_wmmx_loop:               \
      movq %ds:(%esi), %mm0 ;    \
      addl $8, %esi ;            \
      movq %mm0, %es:(%edi) ;    \
      addl $8, %edi ;            \
      decl %ecx ;                \
      jnz even_wmmx_loop ;       \
      movsw
   BLIT_LOOP(word_wlongsmmx, 2, BLIT_CODE)
#undef BLIT_CODE
   emms
   jmp blit_done

   _align_
blit_even_mmxlongs:
#define BLIT_CODE                \
   even_lmmx_loop:               \
      movq %ds:(%esi), %mm0 ;    \
      addl $8, %esi ;            \
      movq %mm0, %es:(%edi) ;    \
      addl $8, %edi ;            \
      decl %ecx ;                \
      jnz even_lmmx_loop
   BLIT_LOOP(even_longsmmx, 2, BLIT_CODE)
#undef BLIT_CODE
   emms
   jmp blit_done

   _align_
blit_long_word:
#define BLIT_CODE                \
   movsl ;                       \
   movsw
   BLIT_LOOP(long_word, 2, BLIT_CODE)
#undef BLIT_CODE
   emms
   jmp blit_done

   _align_
blit_only_one_long:
   BLIT_LOOP(only_one_wordmmx, 2, movsl)
   emms
   jmp blit_done

#endif                           /* ALLEGRO_MMX */

blit_no_mmx:
   cld                           /* for forward copy */

   shrl $1, B_WIDTH              /* halve counter for long copies */
   jz blit_only_one_word
   jnc blit_even_words

   _align_
#define BLIT_CODE                \
   rep ; movsl ;                 \
   movsw
   BLIT_LOOP(longs_and_word, 2, BLIT_CODE)   /* long at a time, plus leftover word */
#undef BLIT_CODE
   jmp blit_done

   _align_
blit_even_words:
#define BLIT_CODE                \
   rep ; movsl
   BLIT_LOOP(even_words, 2, BLIT_CODE)       /* copy a long at a time */
#undef BLIT_CODE
   jmp blit_done

   _align_
blit_only_one_word:
   BLIT_LOOP(only_one_word, 2, movsw)        /* copy just the one word */

   _align_
blit_done:
   popl %es

   movl B_SOURCE, %edx
   UNREAD_BANK()

   movl B_DEST, %edx
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_blit16() */




/* void _linear_blit_backward16(BITMAP *source, BITMAP *dest, int source_x,
 *                              int source_y, int dest_x, dest_y, int width, height);
 *  Reverse blitting routine, for overlapping linear bitmaps.
 */
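/* When source and destination regions overlap and the destination lies
 * below/right of the source, copying forwards would overwrite pixels before
 * they have been read, so this routine walks both bitmaps from the
 * bottom-right corner upwards. A hedged C sketch of the idea (memmove-style,
 * ignoring banking and segment handling):
 *
 *    int x, y;
 *    for (y = h - 1; y >= 0; y--)
 *       for (x = w - 1; x >= 0; x--)
 *          ((unsigned short *)dst->line[dy + y])[dx + x] =
 *             ((unsigned short *)src->line[sy + y])[sx + x];
 */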
FUNC(_linear_blit_backward16)
   pushl %ebp
   movl %esp, %ebp
   pushl %edi
   pushl %esi
   pushl %ebx
   pushl %es

   movl B_HEIGHT, %eax           /* y values go from high to low */
   decl %eax
   addl %eax, B_SOURCE_Y
   addl %eax, B_DEST_Y

   movl B_WIDTH, %eax            /* x values go from high to low */
   decl %eax
   addl %eax, B_SOURCE_X
   addl %eax, B_DEST_X

   movl B_DEST, %edx
   movl %ds, %ebx                /* save data segment selector */
   movl BMP_SEG(%edx), %eax      /* load destination segment */
   movl %eax, %es

   _align_
blit_backwards_loop:
   movl B_DEST, %edx             /* destination bitmap */
   movl B_DEST_Y, %eax           /* line number */
   movl B_DEST_X, %edi           /* x offset */
   WRITE_BANK()                  /* select bank */
   leal (%eax, %edi, 2), %edi

   movl B_SOURCE, %edx           /* source bitmap */
   movl B_SOURCE_Y, %eax         /* line number */
   movl B_SOURCE_X, %esi         /* x offset */
   READ_BANK()                   /* select bank */
   leal (%eax, %esi, 2), %esi

   movl B_WIDTH, %ecx            /* x loop counter */
   movl BMP_SEG(%edx), %edx      /* load data segment */
   movl %edx, %ds
   std                           /* backwards */
   rep ; movsw                   /* copy the line */
   movl %ebx, %ds                /* restore data segment */

   decl B_SOURCE_Y
   decl B_DEST_Y
   decl B_HEIGHT
   jg blit_backwards_loop        /* and loop */

   cld                           /* finished */

   popl %es

   movl B_SOURCE, %edx
   UNREAD_BANK()

   movl B_DEST, %edx
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_blit_backward16() */

FUNC(_linear_blit16_end)
   ret




/* void _linear_masked_blit16(BITMAP *source, *dest, int source_x, source_y,
 *                            int dest_x, dest_y, int width, height);
 *  Masked (skipping zero pixels) blitting routine for linear bitmaps.
 */
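/* The masked blitter copies only pixels that differ from the source
 * bitmap's mask colour. A rough C equivalent of one line of the inner loop
 * (a sketch only, assuming the mask colour is reachable through the source
 * vtable as below; the real code packs the mask into both halves of a
 * register to test two pixels at a time, or eight at a time on the SSE
 * maskmovq path):
 *
 *    unsigned short mask = src->vtable->mask_color;
 *    int x;
 *    for (x = 0; x < w; x++) {
 *       unsigned short c = s[x];
 *       if (c != mask)
 *          d[x] = c;
 *    }
 */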
FUNC(_linear_masked_blit16)
   pushl %ebp
   movl %esp, %ebp
   subl $4, %esp                 /* one local variable */
   pushl %edi
   pushl %esi
   pushl %ebx
   pushl %es

#define V_MASK    -4(%ebp)

   movl B_DEST, %edx
   movl %ds, %ebx
   movl BMP_SEG(%edx), %edx
   movl %edx, %es
   cld

   movl B_SOURCE, %edx
   movl BMP_VTABLE(%edx), %edx
   movl VTABLE_MASK_COLOR(%edx), %eax
   movl %eax, %ecx
   shll $16, %eax
   orw %cx, %ax
   movl %eax, V_MASK

#ifdef ALLEGRO_SSE               /* Use SSE if the compiler supports it */

   /* Speed improvement on the Pentium 3 only, so we need to check for MMX+ and no 3DNow! */
   movl GLOBL(cpu_capabilities), %ecx     /* if MMX+ is enabled (or not disabled :) */
   andl $CPU_MMXPLUS | $CPU_3DNOW, %ecx
   cmpl $CPU_MMXPLUS, %ecx
   jne masked16_no_mmx

   movl B_WIDTH, %ecx
   shrl $3, %ecx                 /* Are there more than 8 pixels? Otherwise, use non-MMX code */
   jz masked16_no_mmx

   movd V_MASK, %mm0             /* Create mask (%mm0) */
   movd V_MASK, %mm1
   psllq $32, %mm0
   por %mm1, %mm0

   pcmpeqd %mm4, %mm4            /* Create inverter mask */

   /* ??? maskmovq is an SSE instruction! */

#define BLIT_CODE                                                          \
      movd %ecx, %mm2;           /* Save line length (%mm2) */             \
      shrl $3, %ecx;                                                       \
      movl V_MASK, %edx;         /* Save 32 bit mask */                    \
                                                                           \
      pushl %es;                 /* Swap ES and DS */                      \
      pushl %ds;                                                           \
      popl %es;                                                            \
      popl %ds;                                                            \
                                                                           \
      _align_;                                                             \
   masked16_mmx_x_loop:                                                    \
                                                                           \
      movq %es:(%esi), %mm1;     /* Read 4 pixels */                       \
      movq %mm0, %mm3;                                                     \
      movq %es:8(%esi), %mm5;    /* Read 4 more pixels */                  \
      movq %mm0, %mm6;                                                     \
                                                                           \
      pcmpeqw %mm1, %mm3;        /* Compare with mask (%mm3/%mm6) */       \
      pcmpeqw %mm5, %mm6;                                                  \
      pxor %mm4, %mm3;           /* Turn 1->0 and 0->1 */                  \
      pxor %mm4, %mm6;                                                     \
      addl $16, %esi;            /* Update src */                          \
      maskmovq %mm3, %mm1;       /* Write if not equal to mask. */         \
      addl $8, %edi;                                                       \
      maskmovq %mm6, %mm5;                                                 \
                                                                           \
      addl $8, %edi;             /* Update dest */                         \
                                                                           \
      decl %ecx;                 /* Any pixel packs left for this line? */ \
      jnz masked16_mmx_x_loop;                                             \
                                                                           \
      movd %mm2, %ecx;           /* Restore pixel count */                 \
      andl $7, %ecx;                                                       \
      jz masked16_mmx_loop_end;  /* Nothing else to do? */                 \
      shrl $1, %ecx;             /* 1 pixel left? */                       \
      jnc masked16_mmx_long;                                               \
                                                                           \
      movw %es:(%esi), %ax;      /* Read 1 pixel */                        \
      addl $2, %esi;                                                       \
      addl $2, %edi;                                                       \
      cmpw %ax, %dx;             /* Compare with mask */                   \
      je masked16_mmx_long;                                                \
      movw %ax, -2(%edi);        /* Write the pixel */                     \
                                                                           \
   masked16_mmx_long:                                                      \
                                                                           \
      shrl $1, %ecx;             /* 2 pixels left? */                      \
      jnc masked16_mmx_qword;                                              \
                                                                           \
      movl %es:(%esi), %eax;     /* Read 2 pixels */                       \
      addl $4, %esi;                                                       \
      addl $4, %edi;                                                       \
      cmpw %ax, %dx;             /* Compare with mask */                   \
      je masked16_mmx_long_2;                                              \
      movw %ax, -4(%edi);        /* Write pixel */                         \
                                                                           \
   masked16_mmx_long_2:                                                    \
      shrl $16, %eax;                                                      \
      shrl $16, %edx;                                                      \
      cmpl %eax, %edx;                                                     \
      je masked16_mmx_qword;                                               \
      movw %ax, -2(%edi);                                                  \
                                                                           \
      _align_;                                                             \
   masked16_mmx_qword:                                                     \
      shrl $1, %ecx;             /* 4 pixels left? */                      \
      jnc masked16_mmx_loop_end;                                           \
                                                                           \
      movq %es:(%esi), %mm1;     /* Read 4 more pixels */                  \
      movq %mm0, %mm3;                                                     \
                                                                           \
      pcmpeqw %mm1, %mm3;        /* Compare with mask (%mm3, %mm6) */      \
      pxor %mm4, %mm3;           /* Turn 1->0 and 0->1 */                  \
      maskmovq %mm3, %mm1;       /* Write if not equal to mask. */         \
                                                                           \
      _align_;                                                             \
   masked16_mmx_loop_end:                                                  \
                                                                           \
      pushl %ds;                 /* Swap back ES and DS */                 \
      popl %es;
   BLIT_LOOP(masked16_mmx_loop, 2, BLIT_CODE)
#undef BLIT_CODE

   emms
   jmp masked16_end;

#endif

   _align_
masked16_no_mmx:
#define BLIT_CODE                                                          \
   movl V_MASK, %edx ;                                                     \
                                                                           \
   test $1, %ecx ;               /* 32 bit aligned -> use new code */      \
   jz masked32_blit_x_loop ;                                               \
   movw (%esi), %ax ;            /* read a pixel */                        \
   cmpw %ax, %dx ;               /* test it */                             \
   je masked16_blit_skip ;                                                 \
   movw %ax, %es:(%edi) ;        /* write the pixel */                     \
masked16_blit_skip:                                                        \
   decl %ecx ;                                                             \
   jng masked32_blit_end ;                                                 \
   addl $2, %esi ;                                                         \
   addl $2, %edi ;                                                         \
                                                                           \
   _align_ ;                                                               \
masked32_blit_x_loop:                                                      \
   movl (%esi), %eax ;           /* read two pixels */                     \
   addl $4, %esi ;                                                         \
   cmpl %eax, %edx ;             /* test it */                             \
   je masked32_blit_skip2 ;                                                \
   cmpw %ax, %dx ;               /* test it */                             \
   je masked32_blit_skip1 ;                                                \
   movw %ax, %es:(%edi) ;        /* write the pixel */                     \
masked32_blit_skip1:                                                       \
   shrl $16, %eax ;                                                        \
   cmpw %ax, %dx ;               /* test it */                             \
   je masked32_blit_skip2 ;                                                \
   movw %ax, %es:2(%edi) ;       /* write the pixel */                     \
masked32_blit_skip2:                                                       \
   addl $4, %edi ;                                                         \
   subl $2, %ecx ;                                                         \
   jg masked32_blit_x_loop ;                                               \
masked32_blit_end:
   BLIT_LOOP(masked32, 2, BLIT_CODE)
#undef BLIT_CODE

masked16_end:
   popl %es

   /* the source must be a memory bitmap, no need for
    *  movl B_SOURCE, %edx
    *  UNREAD_BANK()
    */

   movl B_DEST, %edx
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_masked_blit16() */

#endif      /* ifdef ALLEGRO_COLOR16 */