/*         ______   ___    ___ 
 *        /\  _  \ /\_ \  /\_ \ 
 *        \ \ \L\ \\//\ \ \//\ \      __     __   _ __   ___ 
 *         \ \  __ \ \ \ \  \ \ \   /'__`\ /'_ `\/\`'__\/ __`\
 *          \ \ \/\ \ \_\ \_ \_\ \_/\  __//\ \L\ \ \ \//\ \L\ \
 *           \ \_\ \_\/\____\/\____\ \____\ \____ \ \_\\ \____/
 *            \/_/\/_/\/____/\/____/\/____/\/___L\ \/_/ \/___/
 *                                           /\____/
 *                                           \_/__/
 *
 *      256 color bitmap blitting (written for speed, not readability :-)
 *
 *      By Shawn Hargreaves.
 *
 *      Stefan Schimanski optimised the reverse blitting function.
 *
 *      MMX clear code by Robert Ohannessian.
 *
 *      See readme.txt for copyright information.
 */


#include "asmdefs.inc"
#include "blit.inc"

#ifdef ALLEGRO_COLOR8

.text


/* void _linear_clear_to_color8(BITMAP *bitmap, int color);
 *  Fills a linear bitmap with the specified color.
 *
 *  The filled area is taken from the bitmap structure: lines BMP_CT up to
 *  (but not including) BMP_CB, and on each line BMP_CR - BMP_CL bytes
 *  starting BMP_CL bytes into the line. The line counter is only tested
 *  at the bottom of each loop, so one line is always written. Only the
 *  low byte of color is used; it is replicated into every byte stored.
 *
 *  Three per-line loops exist:
 *     clearMMX_loop     MMX stores through the default segment, used when
 *                       the low 16 bits of %es and %ds match
 *     clearMMXseg_loop  the same code with an %es: override on each store
 *     clear_loop        stosb / stosw / rep stosl through %es
 *  The MMX loops are only assembled when ALLEGRO_MMX is defined, and only
 *  entered when cpu_capabilities has CPU_MMX set and the width is at
 *  least 32 bytes.
 *
 *  Register use in all loops: edx = bitmap, ebx = current line, esi =
 *  width in bytes, edi = write address, ecx = count left on the line.
 *  In the MMX loops the replicated 32 bit color lives on top of the stack
 *  between lines, because eax is needed for the WRITE_BANK() call.
 */
FUNC(_linear_clear_to_color8)
   pushl %ebp
   movl %esp, %ebp
   pushl %edi
   pushl %esi
   pushl %ebx
   movl ARG1, %edx               /* edx = bmp */
   pushl %es 

   movl BMP_SEG(%edx), %eax      /* select segment */
   movl %eax, %es

   movl BMP_CT(%edx), %ebx       /* line to start at */

   movl BMP_CR(%edx), %esi       /* width to clear */
   subl BMP_CL(%edx), %esi

#ifdef ALLEGRO_MMX               /* only use MMX if compiler supports it */

   movl GLOBL(cpu_capabilities), %eax     /* if MMX is enabled (or not disabled :) */
   andl $CPU_MMX, %eax
   jz clear_no_mmx

   movl %esi, %eax               /* if less than 32 pixels, use non-MMX */
   shrl $5, %eax
   orl %eax, %eax
   jz clear_no_mmx

   movb ARG2, %al                /* duplicate color 4 times */
   movb %al, %ah
   shll $16, %eax
   movb ARG2, %al 
   movb %al, %ah

   pushl %eax                    /* keep the 32 bit pattern on the stack */

   movl %ds, %eax
   movl %es, %ecx

   cmpw %ax, %cx                 /* can we use nearptr ? */
   jne clearMMXseg_loop          /* if not, then we have to decode segments...*/
                                 /* else, we save one cycle per 8 pixels on PMMX/K6 */
   _align_
clearMMX_loop:
   movl %ebx, %eax               /* eax = line number in, line address out */
   WRITE_BANK()                  /* select bank */
   movl %eax, %edi 
   addl BMP_CL(%edx), %edi       /* get line address  */

   popl %eax                     /* get eax back */

   movl %esi, %ecx               /* width to clear */

   movd %eax, %mm0               /* restore mmx reg 0 in case it's been clobbered by WRITE_BANK() */
   movd %eax, %mm1
   psllq $32, %mm0               /* mm0 = pattern in both 32 bit halves */
   por %mm1, %mm0

   pushl %eax                    /* save eax */

   testl $7, %edi                /* is destination aligned on 64-bit ? */
   jz clearMMX_aligned

   /* Unaligned start: store 8 bytes at the unaligned address, then round
    * edi up to the next 8 byte boundary and take the 8 - (edi & 7) bytes
    * that lie below that boundary off the count. The rest of that store
    * is simply written again by the aligned code.
    */
clearMMX_do_alignment:
   movl %edi, %eax               /* we want to adjust %ecx  (pairing: see andl) */

   movq %mm0, (%edi)             /* we clear 8 pixels */

   andl $7, %eax                 /* eax = misalignment of edi (0-7), see subl below */

   andl $0xFFFFFFF8, %edi        /* round edi down to an 8 byte boundary */

   subl $8, %eax                 /* eax = -(8 - misalignment) */

   addl $8, %edi                 /* we set %edi to the next aligned memory address */

   addl %eax, %ecx               /* and adjust %ecx to reflect the change */

clearMMX_aligned:
   movl %ecx, %eax               /* save for later */
   shrl $5, %ecx                 /* divide by 32 for 4 * 8-byte memory move */
   jz clearMMX_finish_line       /* fewer than 32 bytes left: skip the unrolled loop */

clearMMX_continue_line:
   movq %mm0, (%edi)             /* move 4x 8 bytes */
   movq %mm0, 8(%edi)            /* MMX instructions can't pair when both write to memory */
   movq %mm0, 16(%edi)
   movq %mm0, 24(%edi)
   addl $32, %edi                /* inserting those in the MMX copy block makes no difference */
   decl %ecx
   jnz clearMMX_continue_line

clearMMX_finish_line:
   movl %eax, %ecx               /* get ecx back */

   testl $31, %ecx               /* check if there's any left */
   jz clearMMX_no_long
                                 /* else, write trailing pixels */
   testl $16, %ecx
   jz clearMMX_finish_line2

   movq %mm0, (%edi)             /* 16 byte chunk */
   movq %mm0, 8(%edi)
   addl $16, %edi 

clearMMX_finish_line2:
   testl $8, %ecx
   jz clearMMX_finish_line3

   movq %mm0, (%edi)             /* 8 byte chunk */
   addl $8, %edi

   /* Last 0-7 bytes: one 8 byte store placed so that it ends exactly at
    * the end of the line, overlapping bytes that are already filled.
    */
clearMMX_finish_line3:
   andl $7, %ecx
   subl $8, %ecx                 /* ecx = remainder - 8, a negative offset */

   movq %mm0, (%edi, %ecx)

clearMMX_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)       /* more lines while BMP_CB > line */
   jg clearMMX_loop              /* and loop */

   popl %eax                     /* drop the saved color pattern */

   emms                          /* clear FPU tag word */

   jmp clear_done

   /* Same algorithm as clearMMX_loop, with every store going through an
    * explicit %es: segment override.
    */
clearMMXseg_loop:
   movl %ebx, %eax               /* eax = line number in, line address out */
   WRITE_BANK()                  /* select bank */
   movl %eax, %edi 
   addl BMP_CL(%edx), %edi       /* get line address  */

   popl %eax                     /* Get eax back */

   movl %esi, %ecx               /* width to clear */

   movd %eax, %mm0               /* restore mmx reg 0 in case it's been clobbered by WRITE_BANK() */
   movd %eax, %mm1
   psllq $32, %mm0               /* mm0 = pattern in both 32 bit halves */
   por %mm1, %mm0

   pushl %eax                    /* save eax */

   testl $7, %edi                /* is destination aligned on 64-bit ? */
   jz clearMMXseg_aligned

clearMMXseg_do_alignment:
   movl %edi, %eax               /* we want to adjust %ecx  (pairing: see andl) */

   movq %mm0, %es:(%edi)         /* we clear 8 pixels */

   andl $7, %eax                 /* eax = misalignment of edi (0-7), see subl below */

   andl $0xFFFFFFF8, %edi        /* round edi down to an 8 byte boundary */

   subl $8, %eax                 /* eax = -(8 - misalignment) */

   addl $8, %edi                 /* we set %edi to the next aligned memory address */

   addl %eax, %ecx               /* and adjust %ecx to reflect the change */

clearMMXseg_aligned:
   movl %ecx, %eax               /* save for later */
   shrl $5, %ecx                 /* divide by 32 for 4 * 8-byte memory move */
   jz clearMMXseg_finish_line    /* fewer than 32 bytes left: skip the unrolled loop */

clearMMXseg_continue_line:
   movq %mm0, %es:(%edi)         /* move 4x 8 bytes */
   movq %mm0, %es:8(%edi)        /* MMX instructions can't pair when both write to memory */
   movq %mm0, %es:16(%edi)
   movq %mm0, %es:24(%edi)
   addl $32, %edi                /* inserting those in the MMX copy block makes no difference */
   decl %ecx
   jnz clearMMXseg_continue_line

clearMMXseg_finish_line:
   movl %eax, %ecx               /* get ecx back */

   testl $31, %ecx               /* check if there's any left */
   jz clearMMXseg_no_long
                                 /* else, write trailing pixels */
   testl $16, %ecx 
   jz clearMMXseg_finish_line2

   movq %mm0, %es:(%edi)         /* 16 byte chunk */
   movq %mm0, %es:8(%edi)
   addl $16, %edi 

clearMMXseg_finish_line2:
   testl $8, %ecx
   jz clearMMXseg_finish_line3

   movq %mm0, %es:(%edi)         /* 8 byte chunk */
   addl $8, %edi

clearMMXseg_finish_line3:
   andl $7, %ecx
   subl $8, %ecx                 /* ecx = remainder - 8, a negative offset */

   movq %mm0, %es:(%edi, %ecx)   /* overlapping store ending at the line end */

clearMMXseg_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)       /* more lines while BMP_CB > line */
   jg clearMMXseg_loop           /* and loop */

   popl %eax                     /* drop the saved color pattern */

   emms                          /* clear FPU tag word */

   jmp clear_done

#endif                           /* ALLEGRO_MMX */

clear_no_mmx:
   cld                           /* stos must move forwards */

   _align_
clear_loop:
   movl %ebx, %eax               /* eax = line number in, line address out */
   WRITE_BANK()                  /* select bank */
   movl %eax, %edi 
   addl BMP_CL(%edx), %edi       /* get line address  */

   movb ARG2, %al                /* duplicate color 4 times */
   movb %al, %ah
   shll $16, %eax
   movb ARG2, %al 
   movb %al, %ah

   movl %esi, %ecx               /* width to clear */
   shrl $1, %ecx                 /* halve for 16 bit clear */
   jnc clear_no_byte
   stosb                         /* clear an odd byte */

clear_no_byte:
   shrl $1, %ecx                 /* halve again for 32 bit clear */
   jnc clear_no_word
   stosw                         /* clear an odd word */

clear_no_word:
   jz clear_no_long              /* ZF is still from the shrl: stos leaves flags alone */

   _align_
clear_x_loop:
   rep ; stosl                   /* clear the line */

clear_no_long:
   incl %ebx
   cmpl %ebx, BMP_CB(%edx)       /* more lines while BMP_CB > line */
   jg clear_loop                 /* and loop */

clear_done:
   popl %es

   UNWRITE_BANK()

   popl %ebx   
   popl %esi
   popl %edi
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_clear_to_color8() */


/* void _linear_blit8(BITMAP *source, BITMAP *dest, int source_x, source_y, 
 *                                    int dest_x, dest_y, int width, height);
 *  Forward blit between linear 8 bit bitmaps.
 *
 *  The width argument is shifted right in place, once or twice, and the
 *  flags left by those shifts pick the widest string move that fits:
 *     width below 2            one movsb per row
 *     width odd (3 or more)    rep movsw followed by movsb
 *     width exactly 2          one movsw per row
 *     width 4n + 2             rep movsl followed by movsw
 *     width 4n                 rep movsl
 *  Each variant is an instance of BLIT_LOOP from blit.inc, which is
 *  expected to provide the per-row setup and the loop over rows.
 *
 *  On entry to the loops: es = destination segment, ebx = the caller's
 *  ds selector, direction flag clear.
 */
FUNC(_linear_blit8)
   pushl %ebp
   movl %esp, %ebp
   pushl %edi
   pushl %esi
   pushl %ebx
   pushl %es

   cld                           /* string moves count upwards */
   movl %ds, %ebx                /* remember our own data selector */
   movl B_DEST, %edx
   movl BMP_SEG(%edx), %eax
   movl %eax, %es                /* es = destination segment */

   shrl $1, B_WIDTH              /* width in words; CF = leftover byte */
   jz blit8_single_byte          /* nothing but (at most) one byte */
   jnc blit8_even_width          /* no leftover byte: try for longs */

   _align_
   BLIT_LOOP(words_and_byte, 1,  /* odd width: words then one byte */
      rep ; movsw ;
      movsb
   )
   jmp blit8_finished

   _align_
blit8_even_width: 
   shrl $1, B_WIDTH              /* width in longs; CF = leftover word */
   jz blit8_single_word          /* nothing but one word */
   jnc blit8_whole_longs         /* no leftover word */

   _align_
   BLIT_LOOP(longs_and_word, 1,  /* longs then one trailing word */
      rep ; movsl ;
      movsw
   )
   jmp blit8_finished

   _align_
blit8_whole_longs: 
   BLIT_LOOP(even_words, 1,      /* width is a multiple of four */
      rep ; movsl ;
   )
   jmp blit8_finished

   _align_
blit8_single_byte: 
   BLIT_LOOP(only_one_byte, 1,   /* one byte per row */
      movsb
   )
   jmp blit8_finished

   _align_
blit8_single_word: 
   BLIT_LOOP(only_one_word, 1,   /* one word per row */
      movsw
   )
                                 /* falls straight into the epilogue */
   _align_
blit8_finished:
   popl %es

   movl B_SOURCE, %edx           /* release the source bank */
   UNREAD_BANK()

   movl B_DEST, %edx             /* release the destination bank */
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   leave                         /* esp = ebp, then pop ebp */
   ret                           /* end of _linear_blit8() */


/* void _linear_blit_backward8(BITMAP *source, BITMAP *dest, int source_x, 
 *                      int source_y, int dest_x, dest_y, int width, height);
 *  Reverse blit for overlapping linear bitmaps: rows are handled from the
 *  last one up to the first, and each row is copied from its right hand
 *  end towards the left with the direction flag set.
 *
 *  The x and y arguments are first moved, in place, to the last column
 *  and last row of the rectangle. Two row loops then exist:
 *     byte loop    rep movsb over the whole row; used when source and
 *                  destination are on the same row and
 *                  source_x - dest_x is 3 or less
 *     dword loop   (width & 3) bytes with rep movsb, then width / 4
 *                  longs with rep movsl; used in every other case
 *
 *  ebx holds the caller's ds selector throughout, since ds is pointed at
 *  the source segment for the duration of each row copy. The direction
 *  flag is cleared again before returning.
 */
FUNC(_linear_blit_backward8)
   pushl %ebp
   movl %esp, %ebp
   pushl %edi
   pushl %esi
   pushl %ebx
   pushl %es

   movl B_WIDTH, %eax            /* start from the rightmost column */
   decl %eax
   addl %eax, B_DEST_X
   addl %eax, B_SOURCE_X

   movl B_HEIGHT, %eax           /* and from the bottom row */
   decl %eax
   addl %eax, B_DEST_Y
   addl %eax, B_SOURCE_Y

   movl %ds, %ebx                /* remember our own data selector */
   movl B_DEST, %edx
   movl BMP_SEG(%edx), %eax
   movl %eax, %es                /* es = destination segment */

   movl B_SOURCE_Y, %eax         /* rows differ: dword loop */
   cmpl B_DEST_Y, %eax
   jne blit8_back_dword_rows

   movl B_SOURCE_X, %eax         /* same row: look at source_x - dest_x */
   subl B_DEST_X, %eax
   cmpl $3, %eax
   jg blit8_back_dword_rows      /* more than 3: dword loop */
                                 /* otherwise fall into the byte loop */
   _align_
blit8_back_byte_rows:
   movl B_DEST, %edx             /* edx = destination bitmap */
   movl B_DEST_Y, %eax           /* eax = row, becomes its address */
   movl B_DEST_X, %edi
   WRITE_BANK()
   addl %eax, %edi               /* edi = last destination byte of the row */

   movl B_SOURCE, %edx           /* edx = source bitmap */
   movl B_SOURCE_Y, %eax         /* eax = row, becomes its address */
   movl B_SOURCE_X, %esi
   READ_BANK()
   addl %eax, %esi               /* esi = last source byte of the row */

   movl BMP_SEG(%edx), %eax      /* fetch the source selector */
   movl B_WIDTH, %ecx            /* ecx = bytes in the row */
   std                           /* copy downwards */
   movl %eax, %ds                /* ds = source segment */
   rep ; movsb

   movl %ebx, %ds                /* back to our own data segment */
   decl B_DEST_Y
   decl B_SOURCE_Y
   decl B_HEIGHT                 /* must stay last: jg tests its flags */
   jg blit8_back_byte_rows

   jmp blit8_back_done

   _align_
blit8_back_dword_rows:
   movl B_DEST, %edx             /* edx = destination bitmap */
   movl B_DEST_Y, %eax           /* eax = row, becomes its address */
   movl B_DEST_X, %edi
   WRITE_BANK()
   addl %eax, %edi               /* edi = last destination byte of the row */

   movl B_SOURCE, %edx           /* edx = source bitmap */
   movl B_SOURCE_Y, %eax         /* eax = row, becomes its address */
   movl B_SOURCE_X, %esi
   READ_BANK()
   addl %eax, %esi               /* esi = last source byte of the row */

   std                           /* copy downwards */
   movl BMP_SEG(%edx), %edx      /* fetch the source selector */
   movl B_WIDTH, %eax            /* eax = bytes in the row */
   movl %edx, %ds                /* ds = source segment */

   movl %eax, %ecx
   andl $3, %ecx                 /* the odd 0-3 bytes go first */
   rep ; movsb

   subl $3, %edi                 /* step back to the start of the */
   subl $3, %esi                 /* long that ends at the current byte */

   movl %eax, %ecx
   shrl $2, %ecx                 /* then the whole longs */
   rep ; movsl

   movl %ebx, %ds                /* back to our own data segment */
   decl B_DEST_Y
   decl B_SOURCE_Y
   decl B_HEIGHT                 /* must stay last: jg tests its flags */
   jg blit8_back_dword_rows

blit8_back_done:
   cld                           /* leave the direction flag clear */

   popl %es

   movl B_SOURCE, %edx           /* release the source bank */
   UNREAD_BANK()

   movl B_DEST, %edx             /* release the destination bank */
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   leave                         /* esp = ebp, then pop ebp */
   ret                           /* end of _linear_blit_backward8() */

/* _linear_blit8_end:
 *  Empty routine that does nothing but return.
 *  NOTE(review): presumably exported as an address marking the end of the
 *  blit code above -- confirm against the code that references it.
 */
FUNC(_linear_blit8_end)
   ret


/* void _linear_masked_blit8(BITMAP *source, *dest, int source_x, source_y, 
 *                           int dest_x, dest_y, int width, height);
 *  Masked (skipping zero pixels) blitting routine for linear bitmaps.
 *
 *  Source pixels whose value is zero are transparent: the matching
 *  destination byte is left alone. Every other source pixel is copied.
 *
 *  Two row implementations are provided, both wrapped in BLIT_LOOP from
 *  blit.inc (expected to supply the per-row setup, with ecx = width,
 *  esi = source, edi = destination, and the loop over rows):
 *
 *   - ALLEGRO_SSE builds only, and only when cpu_capabilities has
 *     CPU_MMXPLUS set and CPU_3DNOW clear and the width is at least 16:
 *     16 pixels per iteration using maskmovq, then a 1 / 2 / 4 / 8 pixel
 *     tail chosen from the low four bits of the width. mm0 is kept at
 *     zero (the mask color) and mm4 at all ones (to invert compares).
 *
 *   - otherwise: scalar code that peels one byte and one word so that the
 *     remaining count is a multiple of four, then tests four pixels at a
 *     time and writes only the non-zero ones through es.
 *
 *  All compares in the SSE path are per byte (pcmpeqb), because a pixel
 *  is one byte: a wider compare would treat a zero pixel next to a
 *  non-zero one as opaque and store the zero.
 */
FUNC(_linear_masked_blit8)
   pushl %ebp
   movl %esp, %ebp
   pushl %edi
   pushl %esi
   pushl %ebx
   pushl %es

   movl B_DEST, %edx
   movl %ds, %ebx                /* remember our own data selector */
   movl BMP_SEG(%edx), %eax
   movl %eax, %es                /* es = destination segment */
   cld                           /* forward direction */

#ifdef ALLEGRO_SSE  /* Use SSE if the compiler supports it */
      
   /* Speed improvement on the Pentium 3 only, so we need to check for MMX+ and no 3DNow! */
   movl GLOBL(cpu_capabilities), %ecx     /* if MMX+ is enabled (or not disabled :) */
   andl $CPU_MMXPLUS | $CPU_3DNOW, %ecx
   cmpl $CPU_MMXPLUS, %ecx
   jne masked8_no_mmx


   movl B_WIDTH, %ecx
   shrl $4, %ecx                 /* Fewer than 16 pixels? Then use the non-MMX code */
   jz masked8_no_mmx

   pxor %mm0, %mm0               /* mm0 = 0, the mask color in every byte */
   
   pcmpeqd %mm4, %mm4            /* Create inverter mask (all ones) */
  
   /* Note: maskmovq is an SSE instruction! It stores through ds:edi, so
    * the row code swaps es and ds: reads then use an es: override and the
    * masked stores land in the destination segment.
    */

   #define BLIT_CODE                                                                 \
      movd %ecx, %mm2;            /* Save line length (%mm2) */                      \
      shrl $4, %ecx;              /* Number of 16 pixel packs */                     \
                                                                                     \
      pushl %es;  /* Swap ES and DS */                                               \
      pushl %ds;                                                                     \
      popl  %es;                                                                     \
      popl  %ds;                                                                     \
                                                                                     \
      _align_;                                                                       \
      masked8_mmx_x_loop:                                                            \
                                                                                     \
      movq %es:(%esi), %mm1;       /* Read 8 pixels */                               \
      movq %mm0, %mm3;                                                               \
      movq %es:8(%esi), %mm5;      /* Read 8 more pixels */                          \
      movq %mm0, %mm6;                                                               \
                                                                                     \
      pcmpeqb %mm1, %mm3;         /* Compare with mask (%mm3/%mm6) */                \
      pcmpeqb %mm5, %mm6;                                                            \
      pxor %mm4, %mm3;            /* Turn 1->0 and 0->1 */                           \
      pxor %mm4, %mm6;                                                               \
      addl $16, %esi;             /* Update src */                                   \
      maskmovq %mm3, %mm1;        /* Write if not equal to mask. */                  \
      addl $8, %edi;                                                                 \
      maskmovq %mm6, %mm5;                                                           \
                                                                                     \
      addl $8, %edi;              /* Update dest */                                  \
                                                                                     \
      decl %ecx;                  /* Any pixel packs left for this line? */          \
      jnz masked8_mmx_x_loop;                                                        \
                                                                                     \
      movd %mm2, %ecx;            /* Restore pixel count */                          \
      andl $15, %ecx;             /* Tail length is 0 to 15 pixels */                \
      jz masked8_mmx_loop_end;    /* Nothing else to do? */                          \
      shrl $1, %ecx;              /* 1 pixel left? */                                \
      jnc masked8_mmx_word;                                                          \
                                                                                     \
      movb %es:(%esi), %al;       /* Read 1 pixel */                                 \
      incl %esi;                                                                     \
      incl %edi;                                                                     \
      orb %al, %al;               /* Compare with mask */                            \
      jz masked8_mmx_word;                                                           \
      movb %al, -1(%edi);         /* Write the pixel */                              \
                                                                                     \
      masked8_mmx_word:                                                              \
      shrl $1, %ecx;              /* 2 pixels left? */                               \
      jnc masked8_mmx_long;                                                          \
                                                                                     \
      movb %es:(%esi), %al;       /* Read 2 pixels */                                \
      movb %es:1(%esi), %ah;                                                         \
      addl $2, %esi;                                                                 \
      addl $2, %edi;                                                                 \
      orb %al, %al;                                                                  \
      jz masked8_mmx_word_2;                                                         \
      movb %al, -2(%edi);         /* Write pixel */                                  \
                                                                                     \
      masked8_mmx_word_2:                                                            \
      orb %ah, %ah;                                                                  \
      jz masked8_mmx_long;                                                           \
      movb %ah, -1(%edi);         /* Write other pixel */                            \
                                                                                     \
      _align_;                                                                       \
      masked8_mmx_long:                                                              \
                                                                                     \
      shrl $1, %ecx;              /* 4 pixels left? */                               \
      jnc masked8_mmx_qword;                                                         \
                                                                                     \
      movl %es:(%esi), %eax;      /* Read 4 pixels */                                \
      addl $4, %esi;                                                                 \
      movd %eax, %mm1;                                                               \
      movl $-1, %eax;                                                                \
      movq %mm0, %mm3;                                                               \
      movd %eax, %mm5;            /* Build XOR flag */                               \
                                                                                     \
      pcmpeqb %mm1, %mm3;         /* Compare with mask (%mm3/%mm6) */                \
      pxor %mm5, %mm3;            /* Turn 1->0 and 0->1 */                           \
      pand %mm5, %mm3;            /* Make sure only the bottom 32 bits are used */   \
      maskmovq %mm3, %mm1;        /* Write if not equal to mask. */                  \
      addl $4, %edi;                                                                 \
                                                                                     \
      _align_;                                                                       \
      masked8_mmx_qword:                                                             \
      shrl $1, %ecx;              /* 8 pixels left? */                               \
      jnc masked8_mmx_loop_end;                                                      \
                                                                                     \
      movq %es:(%esi), %mm1;      /* Read 8 more pixels */                           \
      movq %mm0, %mm3;                                                               \
                                                                                     \
      pcmpeqb %mm1, %mm3;         /* Per byte compare with mask (%mm3) */            \
      pxor %mm4, %mm3;            /* Turn 1->0 and 0->1 */                           \
      maskmovq %mm3, %mm1;        /* Write if not equal to mask. */                  \
                                                                                     \
      _align_;                                                                       \
      masked8_mmx_loop_end:                                                          \
                                                                                     \
      pushl %ds;                  /* ES = destination segment again */               \
      popl %es;
   BLIT_LOOP(masked8_mmx_loop, 1, BLIT_CODE)
   #undef BLIT_CODE
   
   emms                          /* leave the FPU usable again */
   
   jmp masked8_end;
   
#endif
   
   _align_
masked8_no_mmx:

   /* Scalar row code. ecx counts the pixels left on the row. edx is used
    * as scratch for the upper two pixels of each group of four.
    */
   #define BLIT_CODE                                                  \
      test $1, %ecx ;            /* 16 bit aligned->use new code */   \
      jz masked16_blit ;         /* width 16 bit aligned */           \
      movb (%esi), %al ;         /* read a byte */                    \
      incl %esi ;                                                     \
      orb %al, %al ;             /* test it */                        \
      jz masked8_skip ;                                               \
      movb %al, %es:(%edi) ;     /* write the pixel */                \
   masked8_skip:                                                      \
      incl %edi ;                                                     \
      decl %ecx ;                                                     \
      jng masked16_blit_end ;                                         \
                                                                      \
   masked16_blit:                                                     \
      test $3, %ecx ;            /* 32 bit aligned->use new code */   \
      jz masked16_blit_x_loop ;  /* width 32 bit aligned */           \
      movw (%esi), %ax ;         /* read two pixels */                \
      orw %ax, %ax ;                                                  \
      jz masked16_blit_end2 ;    /* both transparent */               \
      orb %al,%al ;                                                   \
      jz masked16_blit_wskip1 ;                                       \
      orb %ah,%ah ;                                                   \
      jz masked16_blit_p1wskip2 ;                                     \
      movw %ax, %es:(%edi) ;     /* write both pixels */              \
      jmp masked16_blit_end2 ;                                        \
      _align_ ;                                                       \
   masked16_blit_p1wskip2:                                            \
      movb %al, %es:(%edi) ;     /* write the first pixel only */     \
      jmp masked16_blit_end2 ;                                        \
      _align_ ;                                                       \
   masked16_blit_wskip1:                                              \
      movb %ah, %es:1(%edi) ;    /* write the second pixel only */    \
      _align_ ;                                                       \
   masked16_blit_end2:                                                \
      subl $2, %ecx ;                                                 \
      jng masked16_blit_end ;                                         \
      addl $2, %esi ;                                                 \
      addl $2, %edi ;                                                 \
                                                                      \
      _align_ ;                                                       \
   masked16_blit_x_loop:                                              \
      movl (%esi), %eax ;         /* read four pixels */              \
      addl $4, %esi ;                                                 \
      movl %eax, %edx ;                                               \
      shrl $16,%edx ;             /* dl, dh = pixels 3 and 4 */       \
      orl %eax, %eax ;                                                \
      jz masked16_blit_skip4 ;    /* all four transparent */          \
      orw %ax, %ax ;                                                  \
      jz masked16_blit_skip2 ;    /* first two transparent */         \
      orb %al,%al ;                                                   \
      jz masked16_blit_skip1 ;                                        \
      orb %ah, %ah ;                                                  \
      jz masked16_put1skip2 ;                                         \
      orb %dl,%dl ;                                                   \
      jz masked16_put12_skip3 ;                                       \
      orb %dh,%dh ;                                                   \
      jz masked16_put123_skip4 ;                                      \
      movl %eax, %es:(%edi) ;     /* all four opaque */               \
      jmp masked16_blit_skip4 ;                                       \
                                                                      \
      _align_ ;                                                       \
   masked16_put1skip2:                                                \
      movb %al, %es:(%edi) ;     /* write pixel 1 */                  \
      jmp masked16_blit_skip2 ;                                       \
      _align_ ;                                                       \
   masked16_put12_skip3:                                              \
      movw %ax, %es:(%edi) ;     /* write pixels 1 and 2 */           \
      orb %dh, %dh ;                                                  \
      jnz masked16_blit_skip3 ;                                       \
      jmp masked16_blit_skip4 ;                                       \
      _align_ ;                                                       \
   masked16_put123_skip4:                                             \
      movw %ax, %es:(%edi) ;     /* write pixels 1 and 2 */           \
      movb %dl, %es:2(%edi) ;     /* write pixel 3 */                 \
      addl $4, %edi ;                                                 \
      subl $4, %ecx ;                                                 \
      jg masked16_blit_x_loop ;                                       \
      jmp masked16_blit_end ;                                         \
                                                                      \
   masked16_blit_skip1:                                               \
      movb %ah, %es:1(%edi) ;    /* write pixel 2 */                  \
   masked16_blit_skip2:                                               \
      orw %dx, %dx ;                                                  \
      jz masked16_blit_skip4 ;                                        \
      orb %dl,%dl ;                                                   \
      jz masked16_blit_skip3 ;                                        \
      orb %dh, %dh ;                                                  \
      jz masked16_put3skip4 ;                                         \
      movw %dx, %es:2(%edi) ;     /* write pixels 3 and 4 */          \
      jmp masked16_blit_skip4 ;                                       \
                                                                      \
      _align_ ;                                                       \
   masked16_put3skip4:                                                \
      movb %dl, %es:2(%edi) ;     /* write pixel 3 */                 \
      addl $4, %edi ;                                                 \
      subl $4, %ecx ;                                                 \
      jg masked16_blit_x_loop ;                                       \
      jmp masked16_blit_end ;                                         \
                                                                      \
   masked16_blit_skip3:                                               \
      movb %dh, %es:3(%edi) ;    /* write pixel 4 */                  \
   masked16_blit_skip4:                                               \
      addl $4, %edi ;                                                 \
      subl $4, %ecx ;                                                 \
      jg masked16_blit_x_loop ;                                       \
   masked16_blit_end:
   BLIT_LOOP(masked16, 1, BLIT_CODE)
   #undef BLIT_CODE

masked8_end:

   popl %es

   /* the source must be a memory bitmap, no need for
    *  movl B_SOURCE, %edx
    *  UNREAD_BANK()
    */

   movl B_DEST, %edx
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _linear_masked_blit8() */


#endif      /* ifdef ALLEGRO_COLOR8 */