
 .align

 .data
 // Scratch area shared by the blit/fill routines in this file.  The routines
 // access it with word-sized ldr/str (saved sp at +0, per-call loop values in
 // the following words), so it must start on a 4-byte boundary.  The .align
 // above is issued before .data and therefore does not align this section.
 // The string contents are only placeholder initial values.
 .balign 4
 storage:
 .string	"0123012301230123012301230123012301230123"
 // Second scratch area; not referenced by the routines visible in this part
 // of the file.  Kept word-aligned as well in case it is accessed the same way.
 .balign 4
 storage2:
 .string	"01230123012301230123"

 // Code section: everything below is ARM code operating on 16-bit pixels.
 .text
.align
  
 // Exported entry points.  Each blit/fillRect routine takes a single pointer
 // (r0) to a parameter block; see the comment in front of each routine.
 .global blitNormalAligned				// standard blit
 
 .global blitNormalBlendAligned				// blit blended with the destination using an alpha value
 .global blitNormalBlend50Aligned			// blit blended 50/50 with the destination

 .global fillSurface					// fill entire surface with a single color
 .global fillRectAligned				// fill rectangle with a single color
 .global fillRectBlend50Aligned				// fill rectangle blending it with the background
 .global fillRectBlendAligned 				// fill rectangle blended with the background using an alpha value
	
 /*.type	memc,#function
*/


//----------------------------------------------------------------------------
/*
*
*	Standard memory to memory blit of 16-bit pixels with no transparency.
*       Handles every combination of word-aligned and halfword-aligned
*       source and destination (both, neither, source only, destination only).
*	
*	void blitNormalAligned(void *str);
*	
*	str{	
*		0   =destination
*		+4  =source
*		+8  =width of the source rectangle to be transfered
*		+12 =height of the source rectangle to be transfered
*		+16 =source step  (SourceTotalWidth- transferWidth) in bytes
*		+20 =destination step (DestTotalWidth -transferWidth) in bytes   
*	}
*	
*/
//----------------------------------------------------------------------------

//  blitNormalAligned(void *str) - copy a rectangle of 16-bit pixels.
//
//  In:   r0 = pointer to the parameter block described above
//        (width and height are counted in pixels, the steps in bytes).
//  Out:  r0 = 0, 1 or 11 depending on which copy loop was used
//        (both/neither aligned, source-only aligned, destination-only aligned).
//
//  The stack pointer stays valid for the whole call; per-call values live in
//  a small stack frame, so the routine does not depend on static storage.
//  A rectangle with width <= 0 or height <= 0 copies nothing and returns 0.
//
//  Alignment of source and destination is classified once, before the first
//  row.  Every following row is assumed to have the same alignment, i.e.
//  (width * 2 + step) must be a multiple of 4 for both surfaces.
//
//  Register use inside the loops:
//      r0  destination pointer         r1  source pointer
//      r2  pixels left in this row     r3  block counter / scratch
//      r4-r11  pixel data              r12 rows left
//      lr  scratch

	.equ	BNA_WIDTH,    0			// frame: width in pixels
	.equ	BNA_SRC_STEP, 4			// frame: source step in bytes
	.equ	BNA_DST_STEP, 8			// frame: destination step in bytes
	.equ	BNA_FRAME,    16		// frame size (keeps sp 8-byte aligned)

	// Advance both pointers to the next row, reload the row width and
	// branch back to \loop while rows remain.  Clobbers r3, r4.
	.macro	BNA_NEXT_ROW loop
	ldr	r3, [sp, #BNA_SRC_STEP]
	ldr	r4, [sp, #BNA_DST_STEP]
	ldr	r2, [sp, #BNA_WIDTH]
	add	r1, r1, r3
	add	r0, r0, r4
	subs	r12, r12, #1
	bne	\loop
	.endm

blitNormalAligned:
	stmdb   sp!, { r4 - r12, lr }
	sub	sp, sp, #BNA_FRAME

	mov	r11, r0
	ldr	r0, [r11]			// destination
	ldr	r1, [r11, #4]			// source
	ldr	r2, [r11, #8]			// width in pixels
	ldr	r12, [r11, #12]			// height -> row counter
	ldr	r3, [r11, #16]			// source step
	ldr	r4, [r11, #20]			// destination step

	cmp	r2, #0				// nothing to do for an empty rectangle
	cmpgt	r12, #0
	ble	blitNormalAligned_exit

	str	r2, [sp, #BNA_WIDTH]
	str	r3, [sp, #BNA_SRC_STEP]
	str	r4, [sp, #BNA_DST_STEP]

	// Pick the copy loop from the low address bits of both pointers.
	tst	r1, #3
	beq	blitNormalAligned_TestDest
	tst	r0, #3
	beq	blitNormalDestAligned_Line	// source misaligned, destination aligned
	b	blitNormalAligned_Line		// both misaligned

blitNormalAligned_TestDest:
	tst	r0, #3
	bne	blitNormalSourceAligned_Line	// source aligned, destination misaligned

// ---- both pointers share the same alignment ----

blitNormalAligned_Line:
	tst	r0, #3
	beq	blitNormalAligned_copyoct
	ldrh	r6, [r1], #2			// leading pixel brings both pointers to a word boundary
	strh	r6, [r0], #2
	sub	r2, r2, #1

blitNormalAligned_copyoct:
	movs	r3, r2, lsr #4			// number of 16-pixel (8-word) blocks
	beq	blitNormalAligned_copywords

blitNormalAligned_octcopy:
	ldmia	r1!, {r4-r11}
	subs	r3, r3, #1
	stmia	r0!, {r4-r11}
	bne	blitNormalAligned_octcopy

blitNormalAligned_copywords:
	mov	r5, r2, lsr #1
	ands	r5, r5, #7			// remaining whole words (pixel pairs)
	beq	blitNormalAligned_pixelcopy

blitNormalAligned_wordcopy:
	ldr	r3, [r1], #4
	subs	r5, r5, #1
	str	r3, [r0], #4
	bne	blitNormalAligned_wordcopy

blitNormalAligned_pixelcopy:
	tst	r2, #1				// odd trailing pixel?
	beq	blitNormalAligned_done
	ldrh	r3, [r1], #2
	strh	r3, [r0], #2

blitNormalAligned_done:
	BNA_NEXT_ROW blitNormalAligned_Line

blitNormalAligned_exit:
	mov	r0, #0

blitNormalAligned_return:
	add	sp, sp, #BNA_FRAME
	ldmia   sp!, { r4 - r12, pc }


// ---- source word aligned, destination only halfword aligned ----

blitNormalSourceAligned_Line:
	movs	r3, r2, lsr #4
	beq	blitNormalSourceAligned_copywords

blitNormalSourceAligned_octcopy:
	ldmia	r1!, {r4-r11}			// 16 source pixels

	strh	r4, [r0], #2			// pixel 0; destination is word aligned from here

	// Re-pair the remaining pixels so that each register holds the two
	// pixels that belong to one destination word.
	mov	lr, r5, lsl #16
	orr	r4, lr, r4, lsr #16
	mov	lr, r6, lsl #16
	orr	r5, lr, r5, lsr #16
	mov	lr, r7, lsl #16
	orr	r6, lr, r6, lsr #16
	mov	lr, r8, lsl #16
	orr	r7, lr, r7, lsr #16
	mov	lr, r9, lsl #16
	orr	r8, lr, r8, lsr #16
	mov	lr, r10, lsl #16
	orr	r9, lr, r9, lsr #16
	mov	lr, r11, lsl #16
	orr	r10, lr, r10, lsr #16

	stmia	r0!, {r4-r10}			// pixels 1..14
	mov	r11, r11, lsr #16
	strh	r11, [r0], #2			// pixel 15

	subs	r3, r3, #1
	bne	blitNormalSourceAligned_octcopy

blitNormalSourceAligned_copywords:
	mov	r5, r2, lsr #1
	ands	r5, r5, #7
	beq	blitNormalSourceAligned_pixelcopy

blitNormalSourceAligned_wordcopy:
	ldr	r3, [r1], #4			// one source word, stored as two halfwords
	subs	r5, r5, #1
	strh	r3, [r0], #2
	mov	r3, r3, lsr #16
	strh	r3, [r0], #2
	bne	blitNormalSourceAligned_wordcopy

blitNormalSourceAligned_pixelcopy:
	tst	r2, #1
	beq	blitNormalSourceAligned_done
	ldrh	r3, [r1], #2
	strh	r3, [r0], #2

blitNormalSourceAligned_done:
	BNA_NEXT_ROW blitNormalSourceAligned_Line

	mov	r0, #1
	b	blitNormalAligned_return


// ---- destination word aligned, source only halfword aligned ----

blitNormalDestAligned_Line:
	movs	r3, r2, lsr #4
	beq	blitNormalDestAligned_copywords

blitNormalDestAligned_octcopy:
	ldrh	r4, [r1], #2			// pixel 0; source is word aligned from here
	ldmia	r1!, {r5-r11}			// pixels 1..14
	orr	r4, r4, r5, lsl #16

	mov	lr, r5, lsr #16
	orr	r5, lr, r6, lsl #16
	mov	lr, r6, lsr #16
	orr	r6, lr, r7, lsl #16
	mov	lr, r7, lsr #16
	orr	r7, lr, r8, lsl #16
	mov	lr, r8, lsr #16
	orr	r8, lr, r9, lsl #16
	mov	lr, r9, lsr #16
	orr	r9, lr, r10, lsl #16
	mov	lr, r10, lsr #16
	orr	r10, lr, r11, lsl #16

	ldrh	lr, [r1], #2			// pixel 15
	mov	r11, r11, lsr #16
	orr	r11, r11, lr, lsl #16

	stmia	r0!, {r4-r11}

	subs	r3, r3, #1
	bne	blitNormalDestAligned_octcopy

blitNormalDestAligned_copywords:
	mov	r5, r2, lsr #1
	ands	r5, r5, #7
	beq	blitNormalDestAligned_pixelcopy

blitNormalDestAligned_wordcopy:
	ldrh	r3, [r1], #2			// two source halfwords, stored as one word
	subs	r5, r5, #1
	ldrh	r4, [r1], #2
	orr	r3, r3, r4, lsl #16
	str	r3, [r0], #4
	bne	blitNormalDestAligned_wordcopy

blitNormalDestAligned_pixelcopy:
	tst	r2, #1
	beq	blitNormalDestAligned_done
	ldrh	r3, [r1], #2
	strh	r3, [r0], #2

blitNormalDestAligned_done:
	BNA_NEXT_ROW blitNormalDestAligned_Line

	mov	r0, #11
	b	blitNormalAligned_return



//----------------------------------------------------------------------------
/*
*
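*	Memory to memory blit of 16-bit pixels that blends the source 50/50
*	with the destination (each result pixel is source/2 + destination/2).
*       Handles every combination of word-aligned and halfword-aligned
*       source and destination.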
*	
*	void blitNormalBlend50Aligned(void *str);
*	
*	str{	
*		0   =destination
*		+4  =source
*		+8  =width of the source rectangle to be transfered
*		+12 =height of the source rectangle to be transfered
*		+16 =source step  (SourceTotalWidth- transferWidth) in bytes
*		+20 =destination step (DestTotalWidth -transferWidth) in bytes   
*	}
*	
*/
//----------------------------------------------------------------------------

//  blitNormalBlend50Aligned(void *str) - blit 16-bit pixels, averaging each
//  source pixel with the destination pixel it lands on.
//
//  In:   r0 = pointer to the parameter block described above
//        (width and height are counted in pixels, the steps in bytes).
//  Out:  r0 = 0, 7 or 2 depending on which loop was used
//        (both/neither aligned, source-only aligned, destination-only aligned).
//
//  Each pixel (or pixel pair in a word) is shifted right by one and masked
//  with 0x7BEF7BEF, which drops the bit that would otherwise leak from one
//  colour field into its neighbour; the two halves are then added.
//
//  The stack pointer stays valid for the whole call; per-call values live in
//  a small stack frame.  A rectangle with width <= 0 or height <= 0 is a
//  no-op that returns 0.
//
//  Alignment is classified once, before the first row; later rows are
//  assumed to keep that alignment, i.e. (width * 2 + step) must be a
//  multiple of 4 for both surfaces.
//
//  Register use inside the loops:
//      r0  destination pointer         r1  source pointer
//      r2  pixels left in this row     r3  block counter / scratch
//      r4-r11  pixel data              r12 mask 0x7BEF7BEF
//      lr  scratch

	.equ	B50_WIDTH,    0			// frame: width in pixels
	.equ	B50_SRC_STEP, 4			// frame: source step in bytes
	.equ	B50_DST_STEP, 8			// frame: destination step in bytes
	.equ	B50_ROWS,     12		// frame: rows left
	.equ	B50_FRAME,    16

	// \dst = \dst/2 + \src/2 per colour field.  Destroys \src.
	.macro	B50_MIX dst, src
	and	\src, r12, \src, lsr #1
	and	\dst, r12, \dst, lsr #1
	add	\dst, \dst, \src
	.endm

	// Advance both pointers to the next row, reload the row width and
	// branch back to \loop while rows remain.  Clobbers r3, r4, r5.
	.macro	B50_NEXT_ROW loop
	ldr	r3, [sp, #B50_SRC_STEP]
	ldr	r4, [sp, #B50_DST_STEP]
	ldr	r5, [sp, #B50_ROWS]
	ldr	r2, [sp, #B50_WIDTH]
	add	r1, r1, r3
	add	r0, r0, r4
	subs	r5, r5, #1
	str	r5, [sp, #B50_ROWS]		// str leaves the flags from subs intact
	bne	\loop
	.endm

blitNormalBlend50Aligned:
	stmdb   sp!, { r4 - r12, lr }
	sub	sp, sp, #B50_FRAME

	mov	r11, r0
	ldr	r0, [r11]			// destination
	ldr	r1, [r11, #4]			// source
	ldr	r2, [r11, #8]			// width in pixels
	ldr	r3, [r11, #12]			// height in rows
	ldr	r4, [r11, #16]			// source step
	ldr	r5, [r11, #20]			// destination step

	cmp	r2, #0				// nothing to do for an empty rectangle
	cmpgt	r3, #0
	ble	blitNormalBlend50Aligned_exit

	str	r2, [sp, #B50_WIDTH]
	str	r4, [sp, #B50_SRC_STEP]
	str	r5, [sp, #B50_DST_STEP]
	str	r3, [sp, #B50_ROWS]

	ldr	r12, =0x7BEF7BEF		// 0xF7DEF7DE >> 1

	// Pick the loop from the low address bits of both pointers.
	tst	r1, #3
	beq	blitNormalBlend50Aligned_TestDest
	tst	r0, #3
	beq	blitNormalBlend50DestAligned_Line
	b	blitNormalBlend50Aligned_Line

blitNormalBlend50Aligned_TestDest:
	tst	r0, #3
	bne	blitNormalBlend50SourceAligned_Line

// ---- both pointers share the same alignment ----

blitNormalBlend50Aligned_Line:
	tst	r0, #3
	beq	blitNormalBlend50Aligned_copyoct
	ldrh	r4, [r1], #2			// leading pixel brings both pointers to a word boundary
	ldrh	r5, [r0]
	B50_MIX	r5, r4
	strh	r5, [r0], #2
	sub	r2, r2, #1

blitNormalBlend50Aligned_copyoct:
	movs	r3, r2, lsr #3			// number of 8-pixel (4-word) blocks
	beq	blitNormalBlend50Aligned_copywords

blitNormalBlend50Aligned_octcopy:
	ldmia	r1!, {r4-r7}
	ldmia	r0, {r8-r11}
	B50_MIX	r8, r4
	B50_MIX	r9, r5
	B50_MIX	r10, r6
	B50_MIX	r11, r7
	subs	r3, r3, #1
	stmia	r0!, {r8-r11}
	bne	blitNormalBlend50Aligned_octcopy

blitNormalBlend50Aligned_copywords:
	mov	r5, r2, lsr #1
	ands	r5, r5, #3			// remaining whole words (pixel pairs)
	beq	blitNormalBlend50Aligned_pixelcopy

blitNormalBlend50Aligned_wordcopy:
	ldr	r3, [r1], #4
	ldr	r4, [r0]
	B50_MIX	r4, r3
	subs	r5, r5, #1
	str	r4, [r0], #4
	bne	blitNormalBlend50Aligned_wordcopy

blitNormalBlend50Aligned_pixelcopy:
	tst	r2, #1				// odd trailing pixel?
	beq	blitNormalBlend50Aligned_done
	ldrh	r3, [r1], #2
	ldrh	r4, [r0]
	B50_MIX	r4, r3
	strh	r4, [r0], #2

blitNormalBlend50Aligned_done:
	B50_NEXT_ROW blitNormalBlend50Aligned_Line

blitNormalBlend50Aligned_exit:
	mov	r0, #0

blitNormalBlend50Aligned_return:
	add	sp, sp, #B50_FRAME
	ldmia   sp!, { r4 - r12, pc }

	.ltorg					// literal pool (blend mask) sits behind the return


// ---- source word aligned, destination only halfword aligned ----

blitNormalBlend50SourceAligned_Line:
	movs	r3, r2, lsr #3
	beq	blitNormalBlend50SourceAligned_copywords

blitNormalBlend50SourceAligned_octcopy:
	ldmia	r1!, {r4-r7}			// 8 source pixels

	ldrh	r8, [r0]			// pixel 0 (r4 must survive for the re-pairing below)
	and	r9, r12, r4, lsr #1
	and	r8, r12, r8, lsr #1
	add	r8, r9, r8
	strh	r8, [r0], #2			// destination is word aligned from here

	// Re-pair source pixels 1..6 to match the destination words.
	mov	lr, r5, lsl #16
	orr	r4, lr, r4, lsr #16
	mov	lr, r6, lsl #16
	orr	r5, lr, r5, lsr #16
	mov	lr, r7, lsl #16
	orr	r6, lr, r6, lsr #16

	ldmia	r0, {r8-r10}			// destination pixels 1..6
	B50_MIX	r8, r4
	B50_MIX	r9, r5
	B50_MIX	r10, r6
	stmia	r0!, {r8-r10}

	// Pixel 7: the source pixel is the high half of r7, the destination
	// pixel is the halfword r0 now points at.  (Taking the high half of
	// the destination word here would blend against pixel 8 instead.)
	mov	r7, r7, lsr #16
	ldrh	r11, [r0]
	B50_MIX	r11, r7
	strh	r11, [r0], #2

	subs	r3, r3, #1
	bne	blitNormalBlend50SourceAligned_octcopy

blitNormalBlend50SourceAligned_copywords:
	mov	r5, r2, lsr #1
	ands	r5, r5, #3
	beq	blitNormalBlend50SourceAligned_pixelcopy

blitNormalBlend50SourceAligned_wordcopy:
	ldr	r3, [r1], #4			// two source pixels
	ldrh	r4, [r0]			// two destination pixels, read as halfwords
	ldrh	r6, [r0, #2]
	orr	r6, r4, r6, lsl #16
	B50_MIX	r6, r3
	subs	r5, r5, #1
	strh	r6, [r0], #2
	mov	r6, r6, lsr #16
	strh	r6, [r0], #2
	bne	blitNormalBlend50SourceAligned_wordcopy

blitNormalBlend50SourceAligned_pixelcopy:
	tst	r2, #1
	beq	blitNormalBlend50SourceAligned_done
	ldrh	r3, [r1], #2
	ldrh	r4, [r0]
	B50_MIX	r4, r3
	strh	r4, [r0], #2

blitNormalBlend50SourceAligned_done:
	B50_NEXT_ROW blitNormalBlend50SourceAligned_Line

	mov	r0, #7
	b	blitNormalBlend50Aligned_return


// ---- destination word aligned, source only halfword aligned ----

blitNormalBlend50DestAligned_Line:
	movs	r3, r2, lsr #3
	beq	blitNormalBlend50DestAligned_copywords

blitNormalBlend50DestAligned_octcopy:
	ldrh	r4, [r1], #2			// pixel 0; source is word aligned from here
	ldmia	r1!, {r5-r7}			// pixels 1..6
	ldmia	r0, {r8-r11}			// 8 destination pixels

	orr	r4, r4, r5, lsl #16
	mov	lr, r5, lsr #16
	orr	r5, lr, r6, lsl #16
	mov	lr, r6, lsr #16
	orr	r6, lr, r7, lsl #16

	ldrh	lr, [r1], #2			// pixel 7
	mov	r7, r7, lsr #16
	orr	r7, r7, lr, lsl #16

	B50_MIX	r8, r4
	B50_MIX	r9, r5
	B50_MIX	r10, r6
	B50_MIX	r11, r7

	stmia	r0!, {r8-r11}

	subs	r3, r3, #1
	bne	blitNormalBlend50DestAligned_octcopy

blitNormalBlend50DestAligned_copywords:
	mov	r5, r2, lsr #1
	ands	r5, r5, #3
	beq	blitNormalBlend50DestAligned_pixelcopy

blitNormalBlend50DestAligned_wordcopy:
	ldrh	r3, [r1], #2			// two source halfwords
	ldrh	r4, [r1], #2
	ldr	r6, [r0]			// two destination pixels
	orr	r3, r3, r4, lsl #16
	B50_MIX	r6, r3
	subs	r5, r5, #1
	str	r6, [r0], #4
	bne	blitNormalBlend50DestAligned_wordcopy

blitNormalBlend50DestAligned_pixelcopy:
	tst	r2, #1
	beq	blitNormalBlend50DestAligned_done
	ldrh	r3, [r1], #2
	ldrh	r6, [r0]
	B50_MIX	r6, r3
	strh	r6, [r0], #2

blitNormalBlend50DestAligned_done:
	B50_NEXT_ROW blitNormalBlend50DestAligned_Line

	mov	r0, #2
	b	blitNormalBlend50Aligned_return




 //----------------------------------------------------------------------------
/*
*
*	Fill surface with a single color value.
*	void fillSurface(void *memory, int length, int color);
*	r0	- memory address
*	r1 	- length in 16-bit pixels
*	r2	- color value (16-bit pixel)
*	
*/
//----------------------------------------------------------------------------
//  fillSurface - write r1 copies of the 16-bit colour in r2 starting at r0.
//
//  The colour is replicated into both halves of a word and stored in three
//  stages: 8-word bursts (16 pixels each), single words (2 pixels each) and
//  finally one halfword when the pixel count is odd.
//
//  Register use: r2-r9 hold the replicated colour, r12 is the loop counter.
fillSurface:
	stmfd	sp!, { r4 - r9, lr }

	orr	r2, r2, r2, lsl #16		// colour in both halfwords
	mov	r3, r2
	mov	r4, r2
	mov	r5, r2
	mov	r6, r2
	mov	r7, r2
	mov	r8, r2
	mov	r9, r2

	movs	r12, r1, lsr #4			// number of 16-pixel bursts
	beq	.LfillSurface_words

.LfillSurface_burst:
	stmia	r0!, { r2 - r9 }
	subs	r12, r12, #1
	bne	.LfillSurface_burst

.LfillSurface_words:
	mov	r12, r1, lsr #1
	ands	r12, r12, #7			// whole words left after the bursts
	beq	.LfillSurface_tail

.LfillSurface_word:
	str	r2, [r0], #4
	subs	r12, r12, #1
	bne	.LfillSurface_word

.LfillSurface_tail:
	tst	r1, #1				// odd pixel count?
	beq	.LfillSurface_return
	strh	r2, [r0], #2

.LfillSurface_return:
	ldmfd	sp!, { r4 - r9, pc }


//----------------------------------------------------------------------------
/*
*
*	Draw filled rectangle
*	
*	void fillRectAligned(void *str);
*	
*	str{	
*		0   =destination
*		+4  =source( undefined)
*		+8  =width of the source rectangle to be transfered
*		+12 =height of the source rectangle to be transfered
*		+16 =source step  (undefined)
*		+20 =destination step (DestTotalWidth -transferWidth) in bytes   
*		+24 =color 
*	}
*	
*/
//----------------------------------------------------------------------------

//  fillRectAligned(void *str) - fill a rectangle of 16-bit pixels with one
//  colour.
//
//  In:   r0 = pointer to the parameter block described above (width and
//        height in pixels, destination step in bytes).
//
//  Everything is kept in registers; the stack pointer is never used as a
//  general-purpose register and no static storage is touched.  A rectangle
//  with width <= 0 or height <= 0 is a no-op.
//
//  Register use inside the loops:
//      r0  destination pointer         r1  rows left
//      r2  pixels left in this row     r3  scratch / block counter
//      r4-r11  colour replicated into both halfwords
//      r12 rectangle width             lr  destination step
fillRectAligned:
	stmdb   sp!, { r4 - r12, lr }

	mov	r11, r0
	ldr	r0, [r11]			// destination
	ldr	r12, [r11, #8]			// width in pixels
	ldr	r1, [r11, #12]			// height in rows
	ldr	lr, [r11, #20]			// destination step
	ldr	r4, [r11, #24]			// fill color

	cmp	r12, #0				// nothing to do for an empty rectangle
	cmpgt	r1, #0
	ble	fillRectAligned_exit

	orr	r4, r4, r4, lsl #16		// colour in both halfwords
	mov	r5, r4
	mov	r6, r4
	mov	r7, r4
	mov	r8, r4
	mov	r9, r4
	mov	r10, r4
	mov	r11, r4

fillRectAligned_Line:
	mov	r2, r12
	tst	r0, #3
	beq	fillRectAligned_copyoct
	strh	r4, [r0], #2			// leading pixel brings r0 to a word boundary
	sub	r2, r2, #1

fillRectAligned_copyoct:
	movs	r3, r2, lsr #4			// number of 16-pixel (8-word) blocks
	beq	fillRectAligned_copywords

fillRectAligned_octcopy:
	stmia	r0!, {r4-r11}
	subs	r3, r3, #1
	bne	fillRectAligned_octcopy

fillRectAligned_copywords:
	mov	r3, r2, lsr #1
	ands	r3, r3, #7			// remaining whole words
	beq	fillRectAligned_pixelcopy

fillRectAligned_wordcopy:
	str	r4, [r0], #4
	subs	r3, r3, #1
	bne	fillRectAligned_wordcopy

fillRectAligned_pixelcopy:
	tst	r2, #1				// odd trailing pixel?
	beq	fillRectAligned_done
	strh	r4, [r0], #2

fillRectAligned_done:
	add	r0, r0, lr			// skip to the start of the next row
	subs	r1, r1, #1
	bne	fillRectAligned_Line		// vertical loop

fillRectAligned_exit:
	ldmia   sp!, { r4 - r12, pc }
	

//----------------------------------------------------------------------------
/*
*
*	Draw filled rectangle, blending the fill color 50/50 with the background
*	
*	void fillRectBlend50Aligned(void *str);
*	
*	str{	
*		0   =destination
*		+4  =source( undefined)
*		+8  =width of the source rectangle to be transfered
*		+12 =height of the source rectangle to be transfered
*		+16 =source step  (undefined)
*		+20 =destination step (DestTotalWidth -transferWidth) in bytes   
*		+24 =color 
*	}
*	
*/
//----------------------------------------------------------------------------

//  fillRectBlend50Aligned(void *str) - fill a rectangle of 16-bit pixels,
//  averaging the fill colour with what is already in the destination.
//
//  In:   r0 = pointer to the parameter block described above (width and
//        height in pixels, destination step in bytes).
//
//  Each destination word is masked with 0xF7DEF7DE (clears the lowest bit of
//  every colour field), shifted right by one and added to the fill colour,
//  which was halved the same way once up front.
//
//  The stack pointer stays valid for the whole call; width, step and row
//  counter live in a small stack frame.  A rectangle with width <= 0 or
//  height <= 0 is a no-op.
//
//  Register use inside the loops:
//      r0  destination pointer         r1  scratch / word counter
//      r2  pixels left in this row     r3  block counter
//      r4-r11  destination pixels      r12 mask 0xF7DEF7DE
//      lr  halved fill colour (both halfwords)

	.equ	FR50_WIDTH,    0		// frame: width in pixels
	.equ	FR50_DST_STEP, 4		// frame: destination step in bytes
	.equ	FR50_ROWS,     8		// frame: rows left
	.equ	FR50_FRAME,    16

	// \reg = \reg/2 + colour/2 per colour field.
	.macro	FR50_MIX reg
	and	\reg, r12, \reg
	add	\reg, lr, \reg, lsr #1
	.endm

fillRectBlend50Aligned:
	stmdb   sp!, { r4 - r12, lr }
	sub	sp, sp, #FR50_FRAME

	mov	r11, r0
	ldr	r0, [r11]			// destination
	ldr	r2, [r11, #8]			// width in pixels
	ldr	r3, [r11, #12]			// height in rows
	ldr	r5, [r11, #20]			// destination step
	ldr	r4, [r11, #24]			// fill color

	cmp	r2, #0				// nothing to do for an empty rectangle
	cmpgt	r3, #0
	ble	fillRectBlend50Aligned_exit

	str	r2, [sp, #FR50_WIDTH]
	str	r5, [sp, #FR50_DST_STEP]
	str	r3, [sp, #FR50_ROWS]

	orr	r4, r4, r4, lsl #16		// colour in both halfwords
	ldr	r12, =0xF7DEF7DE
	and	lr, r12, r4
	mov	lr, lr, lsr #1			// lr = colour / 2 per field

fillRectBlend50Aligned_Line:
	tst	r0, #3
	beq	fillRectBlend50Aligned_copyoct
	ldrh	r1, [r0]			// leading pixel brings r0 to a word boundary
	FR50_MIX r1
	strh	r1, [r0], #2
	sub	r2, r2, #1

fillRectBlend50Aligned_copyoct:
	movs	r3, r2, lsr #4			// number of 16-pixel (8-word) blocks
	beq	fillRectBlend50Aligned_copywords

fillRectBlend50Aligned_octcopy:
	ldmia	r0, {r4-r11}
	FR50_MIX r4
	FR50_MIX r5
	FR50_MIX r6
	FR50_MIX r7
	FR50_MIX r8
	FR50_MIX r9
	FR50_MIX r10
	FR50_MIX r11
	stmia	r0!, {r4-r11}
	subs	r3, r3, #1
	bne	fillRectBlend50Aligned_octcopy

fillRectBlend50Aligned_copywords:
	mov	r1, r2, lsr #1
	ands	r1, r1, #7			// remaining whole words
	beq	fillRectBlend50Aligned_pixelcopy

fillRectBlend50Aligned_wordcopy:
	ldr	r4, [r0]
	FR50_MIX r4
	subs	r1, r1, #1
	str	r4, [r0], #4
	bne	fillRectBlend50Aligned_wordcopy

fillRectBlend50Aligned_pixelcopy:
	tst	r2, #1				// odd trailing pixel?
	beq	fillRectBlend50Aligned_done
	ldrh	r4, [r0]
	FR50_MIX r4
	strh	r4, [r0], #2

fillRectBlend50Aligned_done:
	ldr	r5, [sp, #FR50_DST_STEP]
	ldr	r3, [sp, #FR50_ROWS]
	ldr	r2, [sp, #FR50_WIDTH]		// reload the width for the next row
	add	r0, r0, r5
	subs	r3, r3, #1
	str	r3, [sp, #FR50_ROWS]		// str leaves the flags from subs intact
	bne	fillRectBlend50Aligned_Line	// vertical loop

fillRectBlend50Aligned_exit:
	add	sp, sp, #FR50_FRAME
	ldmia   sp!, { r4 - r12, pc }

	.ltorg					// literal pool (mask) sits behind the return


//----------------------------------------------------------------------------
/*
*
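*	Draw filled rectangle, blending the fill color with the background
*	according to an alpha value
*	
*	void fillRectBlendAligned(void *str);
*	
*	str{	
*		0   =destination
*		+4  =source( undefined)
*		+8  =width of the source rectangle to be transferred
*		+12 =height of the source rectangle to be transferred
*		+16 =source step  (undefined)
*		+20 =destination step (DestTotalWidth -transferWidth) in bytes   
*		+24 =color 
*		+28 =(not read by this routine)
*		+32 =alpha (blend weight of the fill color; presumably 0..32 - confirm against callers)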
*	}
*	
*/
//----------------------------------------------------------------------------

//  fillRectBlendAligned(void *str) - fill a rectangle of 16-bit pixels,
//  blending the fill colour into the destination with weight alpha.
//
//  In:   r0 = pointer to the parameter block: destination at +0, width at +8,
//        height at +12, destination step (bytes) at +20, fill colour at +24
//        and alpha at +32.
//
//  A 32-bit word holding two pixels is processed as two interleaved sets of
//  colour fields, selected by the masks 0x07E0F81F ("x" part) and 0xF81F07E0
//  ("y" part, handled shifted right by 5).  For each set the routine computes
//      d + ((alpha * ((s + 64) - d)) >> 5) - alpha * 2
//  where the +64 per field keeps the difference from going negative and is
//  removed again by the final subtraction.
//
//  The stack pointer stays valid for the whole call; width, step and row
//  counter live in a small stack frame.  A rectangle with width <= 0 or
//  height <= 0 is a no-op.
//
//  Register use inside the loops:
//      r0  destination pointer         r1, r3, r4  scratch
//      r2  pixels left in this row     r5  destination pixel(s) / result
//      r6  x fields of the colour + 64 r7  y fields of the colour + 64
//      r8  alpha*2 in each x field     lr  alpha*2 in each y field
//      r9  x mask   r10  y mask        r11 word counter   r12 alpha

	.equ	FRB_WIDTH,    0			// frame: width in pixels
	.equ	FRB_DST_STEP, 4			// frame: destination step in bytes
	.equ	FRB_ROWS,     8			// frame: rows left
	.equ	FRB_FRAME,    16

	// Blend the fill colour into the destination pixel(s) in r5.
	// Result in r5; clobbers r1, r3, r4.
	.macro	FRB_BLEND
	and	r1, r5, r9			// dx
	sub	r4, r6, r1			// (sx+64) - dx
	mul	r4, r12, r4			// alpha * ((sx+64) - dx)
	add	r4, r1, r4, lsr #5		// ((alpha * ((sx+64) - dx)) >> 5) + dx
	sub	r4, r4, r8			// ... - alpha*2
	and	r4, r4, r9

	and	r1, r5, r10			// dy (not yet shifted)
	sub	r3, r7, r1, lsr #5		// (sy+64) - dy
	mul	r3, r12, r3			// alpha * ((sy+64) - dy)
	mov	r1, r1, lsr #5
	add	r3, r1, r3, lsr #5		// ((alpha * ((sy+64) - dy)) >> 5) + dy
	sub	r3, r3, lr			// ... - alpha*2
	and	r3, r10, r3, lsl #5

	orr	r5, r3, r4
	.endm

fillRectBlendAligned:
	stmdb   sp!, { r4 - r12, lr }
	sub	sp, sp, #FRB_FRAME

	mov	r11, r0

	ldr	r0, [r11]			// destination
	ldr	r2, [r11, #8]			// rectangle width
	ldr	r3, [r11, #12]			// rectangle height
	ldr	r4, [r11, #20]			// destination step
	ldr	r5, [r11, #24]			// fill color
	ldr	r12, [r11, #32]			// alpha

	cmp	r2, #0				// nothing to do for an empty rectangle
	cmpgt	r3, #0
	ble	fillRectBlendAligned_exit

	str	r2, [sp, #FRB_WIDTH]
	str	r4, [sp, #FRB_DST_STEP]
	str	r3, [sp, #FRB_ROWS]

	orr	r5, r5, r5, lsl #16		// upper pixel needs to have the same fill color

	mov	r8, r12, lsl #1
	orr	r9, r8, r12, lsl #12		// alpha*2 at bit 0 and bit 11
	orr	lr, r9, r12, lsl #23		// lr = alpha*2 at bits 0, 11, 22 (y fields)
	orr	r8, r9, r8, lsl #21		// r8 = alpha*2 at bits 0, 11, 21 (x fields)

	ldr	r9, =0x7E0F81F			// x part mask
	ldr	r10, =0xF81F07E0		// y part mask

	and	r6, r5, r9
	ldr	r1, =0x8020040
	add	r6, r6, r1			// x fields of the fill color + 64

	ldr	r1, =0x10020040
	and	r7, r5, r10
	add	r7, r1, r7, lsr #5		// y fields of the fill color + 64

fillRectBlendAligned_line:

	tst	r0, #3				// check for misaligned write
	beq	fillRectBlendAligned_copywords

	ldrh	r5, [r0]			// leading pixel brings r0 to a word boundary
	FRB_BLEND
	sub	r2, r2, #1
	strh	r5, [r0], #2

fillRectBlendAligned_copywords:

	movs	r11, r2, lsr #1			// whole words (pixel pairs) in this row
	beq	fillRectBlendAligned_pixelcopy

fillRectBlendAligned_wordcopy:

	ldr	r5, [r0]
	FRB_BLEND
	subs	r11, r11, #1
	str	r5, [r0], #4
	bne	fillRectBlendAligned_wordcopy

fillRectBlendAligned_pixelcopy:

	tst	r2, #1				// odd trailing pixel?
	beq	fillRectBlendAligned_done

	ldrh	r5, [r0]
	FRB_BLEND
	strh	r5, [r0], #2

fillRectBlendAligned_done:

	ldr	r5, [sp, #FRB_DST_STEP]
	ldr	r3, [sp, #FRB_ROWS]
	ldr	r2, [sp, #FRB_WIDTH]		// reload the width for the next row
	add	r0, r0, r5
	subs	r3, r3, #1
	str	r3, [sp, #FRB_ROWS]		// str leaves the flags from subs intact
	bne	fillRectBlendAligned_line	// vertical loop

fillRectBlendAligned_exit:

	add	sp, sp, #FRB_FRAME
	ldmia   sp!, { r4 - r12, pc }

	.ltorg					// literal pool (masks, offsets) sits behind the return
	


//----------------------------------------------------------------------------
/*
*
*	Block transfer with blending ( 32 levels of alpha)
*	
*	void blitNormalBlendAligned(void *str);
*	
*	str{	
*		0   =destination
*		+4  =source
*		+8  =width of the source rectangle to be transferred
*		+12 =height of the source rectangle to be transferred
*		+16 =source step  (SourceTotalWidth- transferWidth) in bytes
*		+20 =destination step (DestTotalWidth -transferWidth) in bytes   
*		+24 =color (not read by this routine)
*		+32 =alpha (blend weight of the source)
*	}
*	
*/
//----------------------------------------------------------------------------


//  blitNormalBlendAligned(void *str) - blit 16-bit pixels, blending every
//  source pixel into the destination with weight alpha.
//
//  In:   r0 = pointer to the parameter block: destination at +0, source at
//        +4, width at +8, height at +12, source step at +16, destination
//        step at +20 (steps in bytes) and alpha at +32.
//
//  A 32-bit word holding two pixels is processed as two interleaved sets of
//  colour fields, selected by the masks 0x07E0F81F ("x" part) and 0xF81F07E0
//  ("y" part, handled shifted right by 5).  For each set the routine computes
//      d + ((alpha * ((s + 64) - d)) >> 5) - alpha * 2
//  where the +64 per field keeps the difference from going negative and is
//  removed again by the final subtraction.
//
//  The stack pointer stays valid for the whole call; width, steps and row
//  counter live in a small stack frame.  A rectangle with width <= 0 or
//  height <= 0 is a no-op.
//
//  Alignment is classified once, before the first row; later rows are
//  assumed to keep that alignment, i.e. (width * 2 + step) must be a
//  multiple of 4 for both surfaces.
//
//  Register use inside the loops:
//      r0  destination pointer         r1  source pointer
//      r2  pixels left in this row     r3, r4, r6  scratch
//      r5  destination pixel(s) / result
//      r7  source pixel(s)             r8  alpha*2 in each x field
//      r9  x mask   r10  y mask        r11 word counter
//      r12 alpha                       lr  alpha*2 in each y field

	.equ	BNB_WIDTH,    0			// frame: width in pixels
	.equ	BNB_DST_STEP, 4			// frame: destination step in bytes
	.equ	BNB_SRC_STEP, 8			// frame: source step in bytes
	.equ	BNB_ROWS,     12		// frame: rows left
	.equ	BNB_FRAME,    16

	// Blend the source pixel(s) in r7 into the destination pixel(s) in r5.
	// Result in r5; clobbers r3, r4, r6, r7.
	.macro	BNB_BLEND
	ldr	r3, =0x8020040
	and	r4, r7, r9
	add	r4, r4, r3			// sx + 64
	and	r6, r5, r9			// dx
	sub	r4, r4, r6			// (sx+64) - dx
	mul	r4, r12, r4			// alpha * ((sx+64) - dx)
	add	r4, r6, r4, lsr #5		// ((alpha * ((sx+64) - dx)) >> 5) + dx
	sub	r4, r4, r8			// ... - alpha*2
	and	r4, r4, r9

	ldr	r3, =0x10020040
	and	r7, r7, r10
	add	r7, r3, r7, lsr #5		// sy + 64
	and	r6, r5, r10
	mov	r6, r6, lsr #5			// dy
	sub	r3, r7, r6			// (sy+64) - dy
	mul	r3, r12, r3			// alpha * ((sy+64) - dy)
	add	r3, r6, r3, lsr #5		// ((alpha * ((sy+64) - dy)) >> 5) + dy
	sub	r3, r3, lr			// ... - alpha*2
	and	r3, r10, r3, lsl #5

	orr	r5, r3, r4
	.endm

	// Advance both pointers to the next row, reload the row width and
	// branch back to \loop while rows remain.  Clobbers r3, r4, r5.
	.macro	BNB_NEXT_ROW loop
	ldr	r5, [sp, #BNB_DST_STEP]
	ldr	r4, [sp, #BNB_SRC_STEP]
	ldr	r3, [sp, #BNB_ROWS]
	ldr	r2, [sp, #BNB_WIDTH]
	add	r0, r0, r5
	add	r1, r1, r4
	subs	r3, r3, #1
	str	r3, [sp, #BNB_ROWS]		// str leaves the flags from subs intact
	bne	\loop
	.endm

blitNormalBlendAligned:
	stmdb   sp!, { r4 - r12, lr }
	sub	sp, sp, #BNB_FRAME

	mov	r11, r0

	ldr	r0, [r11]			// destination
	ldr	r1, [r11, #4]			// source
	ldr	r2, [r11, #8]			// source width
	ldr	r3, [r11, #12]			// source height
	ldr	r10, [r11, #16]			// source step
	ldr	r4, [r11, #20]			// destination step
	ldr	r12, [r11, #32]			// alpha

	cmp	r2, #0				// nothing to do for an empty rectangle
	cmpgt	r3, #0
	ble	blitNormalBlendAligned_exit

	str	r2, [sp, #BNB_WIDTH]
	str	r4, [sp, #BNB_DST_STEP]
	str	r10, [sp, #BNB_SRC_STEP]
	str	r3, [sp, #BNB_ROWS]

	mov	r8, r12, lsl #1
	orr	r9, r8, r12, lsl #12		// alpha*2 at bit 0 and bit 11
	orr	lr, r9, r12, lsl #23		// lr = alpha*2 at bits 0, 11, 22 (y fields)
	orr	r8, r9, r8, lsl #21		// r8 = alpha*2 at bits 0, 11, 21 (x fields)

	ldr	r9, =0x7E0F81F			// x part mask
	ldr	r10, =0xF81F07E0		// y part mask

	// Pick the loop from the low address bits of both pointers.
	tst	r1, #3
	beq	blitNormalBlend_TestDest
	tst	r0, #3
	beq	blitNormalBlendDestAligned_copywords
	b	blitNormalBlendAligned_line
blitNormalBlend_TestDest:
	tst	r0, #3
	bne	blitNormalBlendSourceAligned_copywords


//  Both destination and source ( or neither)  aligned version

blitNormalBlendAligned_line:

	tst	r0, #3
	beq	blitNormalBlendAligned_copywords

	ldrh	r7, [r1], #2			// leading pixel brings both pointers to a word boundary
	ldrh	r5, [r0]
	BNB_BLEND
	strh	r5, [r0], #2
	sub	r2, r2, #1

blitNormalBlendAligned_copywords:

	movs	r11, r2, lsr #1			// whole words (pixel pairs) in this row
	beq	blitNormalBlendAligned_pixelcopy

blitNormalBlendAligned_wordcopy:

	ldr	r7, [r1], #4			// load source pixels
	ldr	r5, [r0]			// load dest pixels
	BNB_BLEND
	str	r5, [r0], #4
	subs	r11, r11, #1
	bne	blitNormalBlendAligned_wordcopy

blitNormalBlendAligned_pixelcopy:

	tst	r2, #1				// odd trailing pixel?
	beq	blitNormalBlendAligned_done

	ldrh	r7, [r1], #2
	ldrh	r5, [r0]
	BNB_BLEND
	strh	r5, [r0], #2

blitNormalBlendAligned_done:

	BNB_NEXT_ROW blitNormalBlendAligned_line	// vertical loop

blitNormalBlendAligned_exit:

	add	sp, sp, #BNB_FRAME
	ldmia   sp!, { r4 - r12, pc }

	.ltorg					// literal pool sits behind the return


//  Source Aligned version (destination only halfword aligned)

blitNormalBlendSourceAligned_copywords:

	movs	r11, r2, lsr #1
	beq	blitNormalBlendSourceAligned_pixelcopy

blitNormalBlendSourceAligned_wordcopy:

	ldrh	r3, [r0]			// load dest pixel
	ldrh	r5, [r0, #2]			// load dest pixel
	ldr	r7, [r1], #4			// load source pixels
	orr	r5, r3, r5, lsl #16
	BNB_BLEND
	strh	r5, [r0], #2
	mov	r5, r5, lsr #16
	strh	r5, [r0], #2
	subs	r11, r11, #1
	bne	blitNormalBlendSourceAligned_wordcopy

blitNormalBlendSourceAligned_pixelcopy:

	tst	r2, #1
	beq	blitNormalBlendSourceAligned_done

	ldrh	r7, [r1], #2
	ldrh	r5, [r0]
	BNB_BLEND
	strh	r5, [r0], #2

blitNormalBlendSourceAligned_done:

	BNB_NEXT_ROW blitNormalBlendSourceAligned_copywords	// vertical loop

blitNormalBlendSourceAligned_exit:

	b	blitNormalBlendAligned_exit

	.ltorg


//  Destination Aligned version (source only halfword aligned)

blitNormalBlendDestAligned_copywords:

	movs	r11, r2, lsr #1
	beq	blitNormalBlendDestAligned_pixelcopy

blitNormalBlendDestAligned_wordcopy:

	ldrh	r3, [r1], #2			// load source pixel
	ldrh	r7, [r1], #2			// load source pixel
	ldr	r5, [r0]			// load dest pixels
	orr	r7, r3, r7, lsl #16
	BNB_BLEND
	str	r5, [r0], #4
	subs	r11, r11, #1
	bne	blitNormalBlendDestAligned_wordcopy

blitNormalBlendDestAligned_pixelcopy:

	tst	r2, #1
	beq	blitNormalBlendDestAligned_done

	ldrh	r7, [r1], #2
	ldrh	r5, [r0]
	BNB_BLEND
	strh	r5, [r0], #2

blitNormalBlendDestAligned_done:

	BNB_NEXT_ROW blitNormalBlendDestAligned_copywords	// vertical loop

blitNormalBlendDestAligned_exit:

	b	blitNormalBlendAligned_exit

	.ltorg




