; ****************************************************************************
;
;                                   Video
;
; ****************************************************************************

#include "include.inc"

	.text

	; The renderer builds a glyph address by adding the 6-bit character code
	; (0..63) to the LOW byte of (Font + scanline*64) without propagating a
	; carry into the high byte. That only works if the low byte never
	; overflows, i.e. if the table starts on a 64-byte boundary.
	.align 6		; align font to 64 bytes
.global Font
Font:
#include "font\font.S"	// original ZX81 font
;#include "font\font2.S"	// custom font
;#include "font\font3.S"	// IBM font


	.balign 2	; to avoid linker error "warning: internal error: out of range error" must be at the end of unaligned tables


; video configuration tables (in ROM)
; VGA timing record, read from flash with LPM. DispUpdate consumes the five
; 16-bit words strictly in the order below, so neither the order nor the size
; of an entry may change. The quoted "(=...)" results match F_CPU = 24 MHz.
; The (24-HEIGHT)*16/2 term moves VSYNC earlier by half of the scanlines that
; are unused when fewer than 24 text rows are shown (16 scanlines per row in
; VGA) - presumably to keep the picture vertically centered.
.global VConfVGA
VConfVGA:
	.word	(F_CPU+15743)/31469-1	; VC_HEND (u16) end of horizontal line (=HCYCLES-1, -> ICR1) (=762)
	.word	(F_CPU+131121)/262242-1	; VC_HSYNCEND (u16) end of horizontal synchronization pulse (=HSYNCLEN-1, -> OCR1A) (=91)
	.word	525			; VC_VLINES (u16) vertical lines total (=VLINES, -> VLines)
	.word	525-81-2 - (24-HEIGHT)*16/2 ; VC_VSYNCBEG (u16) start of VSYNC (=VLINES-VBP-VSYNCLEN, -> VSyncBeg)
	.word	525-81 - (24-HEIGHT)*16/2 ; VC_VSYNCEND (u16) end of VSYNC (=VLINES-VBP, -> VSyncEnd)

; PAL timing record, read from flash with LPM. Same five-word layout as the
; other timing records; DispUpdate reads the words in exactly this order.
; The quoted "(=...)" results match F_CPU = 24 MHz. A text row is 8 scanlines
; tall in the TV modes, hence the (24-HEIGHT)*8/2 correction of the VSYNC
; position when fewer than 24 rows are displayed.
.global VConfPAL
VConfPAL:
	.word	(F_CPU+7845)/15690-1	; VC_HEND (u16) end of horizontal line (=HCYCLES-1, -> ICR1) (=1529)
	.word	(F_CPU+106013)/212027-1	; VC_HSYNCEND (u16) end of horizontal synchronization pulse (=HSYNCLEN-1, -> OCR1A) (=112)
	.word	312			; VC_VLINES (u16) vertical lines total (=VLINES, -> VLines)
	.word	312-67-3 - (24-HEIGHT)*8/2 ; VC_VSYNCBEG (u16) start of VSYNC (=VLINES-VBP-VSYNCLEN, -> VSyncBeg)
	.word	312-67 - (24-HEIGHT)*8/2 ; VC_VSYNCEND (u16) end of VSYNC (=VLINES-VBP, -> VSyncEnd)

; NTSC timing record, read from flash with LPM. Same five-word layout as the
; other timing records; DispUpdate reads the words in exactly this order.
; The horizontal timing (first two words) is identical to the PAL record;
; only the vertical line counts differ.
.global VConfNTSC
VConfNTSC:
	.word	(F_CPU+7845)/15690-1	; VC_HEND (u16) end of horizontal line (=HCYCLES-1, -> ICR1) (=1529)
	.word	(F_CPU+106013)/212027-1	; VC_HSYNCEND (u16) end of horizontal synchronization pulse (=HSYNCLEN-1, -> OCR1A) (=112)
	.word	262			; VC_VLINES (u16) vertical lines total (=VLINES, -> VLines)
	.word	262-40-3 - (24-HEIGHT)*8/2 ; VC_VSYNCBEG (u16) start of VSYNC (=VLINES-VBP-VSYNCLEN, -> VSyncBeg)
	.word	262-40 - (24-HEIGHT)*8/2 ; VC_VSYNCEND (u16) end of VSYNC (=VLINES-VBP, -> VSyncEnd)

; Lookup table: display mode (0..3) -> address of its timing record.
; DispUpdate indexes it with (mode * 2), fetches the 16-bit entry with LPM and
; then uses that value as the LPM pointer to the record, so each entry must be
; exactly one word and the entries must stay in display-mode order.
.global VConf
VConf:
	.word	VConfVGA	; 0: (SEL+SEL2) VGA2 ... not used
	.word	VConfVGA	; 1: (SEL2) VGA
	.word	VConfNTSC	; 2: (SEL1) NTSC
	.word	VConfPAL	; 3: (-) PAL

; ----------------------------------------------------------------------------
;                      Get current frame counter
; ----------------------------------------------------------------------------
; OUTPUT: R25:R24 = current frame counter
; DESTROYS: R23, R22
; STACK: 2 bytes
; ----------------------------------------------------------------------------
; - The 16-bit counter is advanced by the video interrupt, so a plain two-byte
;   read could be torn. The counter is therefore sampled repeatedly until two
;   consecutive samples are equal; that value is returned.
; - Usable as a timer with a resolution of 20 ms and a period of 21 minutes.
; - The counter is incremented right after the last visible line of the
;   image, so a change of its value marks the start of the vertical blanking.

.global GetFrame
GetFrame:
	ldd	r22,Y+DATA_FRAME	; first sample -> R23:R22
	ldd	r23,Y+DATA_FRAME+1
1:	movw	r24,r22			; keep previous sample in R25:R24
	ldd	r22,Y+DATA_FRAME	; fresh sample -> R23:R22
	ldd	r23,Y+DATA_FRAME+1
	cp	r22,r24			; fresh sample equal to the previous one?
	cpc	r23,r25
	brne	1b			; no, counter changed in between - sample again
	ret				; R25:R24 (= R23:R22) is a consistent value

; ----------------------------------------------------------------------------
;                         Wait for next video frame
; ----------------------------------------------------------------------------
; DESTROYS: R25..R20
; STACK: 4 bytes
; ----------------------------------------------------------------------------
; - Busy-waits until the frame counter returned by GetFrame differs from the
;   value it had on entry, i.e. returns at the start of the next vertical
;   blanking period.
; - One frame takes 20 ms (PAL, 50 fps) or 17 ms (VGA and NTSC, 60 fps).
; - The label WaitFrame4 is also the shared "ret" that DispUpdate branches to;
;   it must stay in place and within branch range of DispUpdate.

.global WaitFrame
WaitFrame:
	rcall	GetFrame	; R25:R24 <- frame counter on entry
	movw	r20,r24		; R21:R20 <- reference value
1:	rcall	GetFrame	; R25:R24 <- frame counter now
	cp	r24,r20		; still the reference frame?
	cpc	r25,r21
	breq	1b		; yes, keep polling
WaitFrame4:
	ret

; ----------------------------------------------------------------------------
;                         Wait for number of frames
; ----------------------------------------------------------------------------
; INPUT: R25:R24 = number of frames (0 = return immediately)
; DESTROYS: R25..R20
; STACK: 6 bytes
; ----------------------------------------------------------------------------
; - one frame takes 20 ms (PAL with 50 fps) or 17 ms (VGA and NTSC with 60 fps)
; - The loop counter lives in X (R27:R26), saved and restored here.
;   It must NOT live in Y (R29:R28): Y is the global data base pointer used by
;   GetFrame (Y+DATA_FRAME) and by the video interrupt (Y+DATA_*), so changing
;   it while waiting would make both access wrong memory. X is safe because
;   the video interrupt pushes and pops XL/XH itself.

.global WaitFrameNum
WaitFrameNum:
	sbiw	r24,0		; zero frames requested?
	breq	WaitFrameNum4	; yes, nothing to wait for

	push	r26		; preserve caller's X
	push	r27
	movw	r26,r24		; X <- number of frames left to wait
WaitFrameNum2:
	rcall	WaitFrame	; Wait for next video frame (destroys R25..R20)
	sbiw	r26,1		; one frame done
	brne	WaitFrameNum2	; more frames to wait
	pop	r27		; restore caller's X
	pop	r26

WaitFrameNum4:
	ret

; ----------------------------------------------------------------------------
;  Update videomode (called from interrupt, from init and from height update)
; ----------------------------------------------------------------------------
; DESTROYS: R31, R30, R25, R24, R1, R0
; STACK: 2 bytes
; ----------------------------------------------------------------------------

; Reads the display-mode selector from port D and, only if it differs from the
; stored DATA_DISPMODE, reprograms the video timing: number of visible
; scanlines, SPI double-speed bit, Timer1 period / sync pulse / counter, and
; the vertical timing variables. All table values come from flash via LPM.
.global DispUpdate
DispUpdate:

; ----- load selected display mode
; Three rotates move PIND bits 7..6 into bits 1..0. The unknown carry that is
; rotated in ends up in bit 2 and is removed by the mask.

	in	r30,_SFR_IO_ADDR(PIND)	; read port D
	rol	r30
	rol	r30
	rol	r30
	andi	r30,B0+B1	; mask display mode 0..3

; ----- check if display mode has been changed

	ldd	r31,Y+DATA_DISPMODE ; read display mode
	cp	r31,r30		; check display mode
	breq	WaitFrame4	; display mode not changed (WaitFrame4 is just a "ret" borrowed from WaitFrame)

; ----- set new display mode

	std	Y+DATA_DISPMODE,r30

; ----- visible vertical lines
; Modes >= DISP_TV are TV modes (8 scanlines per text row, SPI2X off);
; modes below DISP_TV are VGA (16 scanlines per text row, SPI2X on).

	ldi	r24,8 		; videolines per row in TV modes
	ldi	r31,0		; SPSR value in TV mode (SPI2X clear)
	cpi	r30,DISP_TV	; TV mode ?
	brcc	1f		; TV mode (PAL or NTSC)
	ldi	r24,16 		; videolines per row in VGA mode
	ldi	r31,BIT(SPI2X)	; SPSR value in VGA mode (SPI2X set)

1:	ldd	r25,Y+DATA_DISPROWS ; number of displayed rows
	mul	r25,r24		; R1:R0 <- number of visible videolines
	std	Y+DATA_VVISIBLE,r0
	std	Y+DATA_VVISIBLE+1,r1

; ----- set SPI2X clock

	out	_SFR_IO_ADDR(SPSR),r31

; ----- prepare configuration table -> Z
; Z <- VConf + mode*2 (subtracting the negated address is the AVR idiom for
; adding a 16-bit constant), then Z <- the record address stored there.

	add	r30,r30		; display mode * 2
	clr	r31
	subi	r30,lo8(-(VConf))
	sbci	r31,hi8(-(VConf))

	lpm	r24,Z+
	lpm	r25,Z
	movw	r30,r24		; Z <- address of selected timing record

; ----- set horizontal timer
; 16-bit timer registers are written high byte first, then low byte.

	lpm	r24,Z+
	lpm	r25,Z+		; VC_HEND (u16) end of horizontal line (=HCYCLES-1, -> ICR1)
#ifdef MCU8
	out	_SFR_IO_ADDR(ICR1H),r25
	out	_SFR_IO_ADDR(ICR1L),r24	; set Timer1 ICR1
#else
	sts	ICR1H,r25
	sts	ICR1L,r24	; set Timer1 ICR1
#endif
	
; ----- set sound timer

	lsr	r25
	ror	r24		; cycles per line / 2
#ifdef MCU8
	out	_SFR_IO_ADDR(OCR1BH),r25
	out	_SFR_IO_ADDR(OCR1BL),r24 ; set Timer1 OCR1B
#else
	sts	OCR1BH,r25
	sts	OCR1BL,r24	; set Timer1 OCR1B
#endif

; ----- set length of horizontal pulse

	lpm	r24,Z+
	lpm	r25,Z+		; VC_HSYNCEND (u16) end of horizontal synchronization pulse (=HSYNCLEN-1, -> OCR1A)
#ifdef MCU8
	out	_SFR_IO_ADDR(OCR1AH),r25
	out	_SFR_IO_ADDR(OCR1AL),r24 ; set Timer1 OCR1A
#else
	sts	OCR1AH,r25
	sts	OCR1AL,r24	; set Timer1 OCR1A
#endif

; ----- reset Timer1 counter
; NOTE(review): the MUL above overwrote R1:R0. This write is only correct if
; R_ZERO is neither R1 nor R0 - confirm against the definition of R_ZERO.

#ifdef MCU8
	out	_SFR_IO_ADDR(TCNT1H),R_ZERO
	out	_SFR_IO_ADDR(TCNT1L),R_ZERO ; set Timer1 TCNT1
#else
	sts	TCNT1H,R_ZERO
	sts	TCNT1L,R_ZERO	; set Timer1 TCNT1
#endif

; ----- total vertical lines

	lpm	r24,Z+
	lpm	r25,Z+		; VC_VLINES (u16) vertical lines total (=VLINES, -> VLines)
	std	Y+DATA_VLINES,r24
	std	Y+DATA_VLINES+1,r25

; ----- start of VSYNC

	lpm	r24,Z+
	lpm	r25,Z+		; VC_VSYNCBEG (u16) start of VSYNC (=VLINES-VBP-VSYNCLEN, -> VSyncBeg)
	std	Y+DATA_VSYNCBEG,r24
	std	Y+DATA_VSYNCBEG+1,r25

; ----- end of VSYNC

	lpm	r24,Z+
	lpm	r25,Z+		; VC_VSYNCEND (u16) end of VSYNC (=VLINES-VBP, -> VSyncEnd)
	std	Y+DATA_VSYNCEND,r24
	std	Y+DATA_VSYNCEND+1,r25
	ret

; ----------------------------------------------------------------------------
;          Video rendering continue - render VSYNC and black lines
; ----------------------------------------------------------------------------
; Local variables:
; Register aliases used by the video interrupt and its helper code below.
; SAVEXL/SAVEXH, TMP/TMP2 and VLINEL/VLINEH must each stay an adjacent
; even/odd register pair, because the code applies MOVW, ADIW and SBIW to them.
#define SAVEXL R20 // save XL
#define SAVEXH R21 // save XH
#define LINEOFF R22 // line offset of the tile
#define TILECNT R23 // counter of tiles on row
#define TMP R24 // temporary, counter, must be LOW to TMP2
#define TMP2 R25 // alternative temporary, character delay, must be HIGH to TMP
;  R27:R26 (XH:XL) pointer to tile index in RAM
;     (Y register is global, do not use local)
;  R31:R30 (ZH:ZL) pointer to tile in ROM
#define VLINEL R30 // ZL, VLine low, must be LOW to VLINEH
#define VLINEH R31 // ZH, VLine high, must be HIGH to VLINEL

; Duplicated:

; [40] Sound output and key input
; Per-scanline housekeeping, called with RCALL from the video interrupt:
;  1) advances a 16-bit sound phase accumulator kept in OCR1B by DATA_SNDINC
;     and toggles the sound bit of PORTB whenever the addition overflows,
;  2) polls the UART and, if a byte was received, appends it to the circular
;     buffer KeyBuf at index DATA_KEYWRITE.
; Both keyboard paths are padded to the same length, so the routine always
; takes the same number of cycles (40, or 43 including the RCALL).
; DESTROYS: TMP, TMP2, ZL, ZH, flags
SndOutKeyIn:

; [19] sound output (OC1B is disconnected)

	; [4] read OCR1B register -> TMP2:TMP
	lds	TMP,OCR1BL	; [2] get current sound level
	lds	TMP2,OCR1BH	; [2] ... read from L must be before H

	; [6] increment value
	ldd	ZL,Y+DATA_SNDINC ; [2] sound phase increment
	ldd	ZH,Y+DATA_SNDINC+1 ; [2]

	add	TMP,ZL		; [1] shift sound phase
	adc	TMP2,ZH		; [1] carry = phase overflow, tested by BRCC below

	; [4] write new value (STS does not change the carry flag)
	sts	OCR1BH,TMP2	; [2] write to H must be before L
	sts	OCR1BL,TMP	; [2]

	; [5] output to port - use increased value / 65536
	; IN and LDI leave the carry from ADC untouched; both branch outcomes
	; take the same 5 cycles.
	in	TMP2,_SFR_IO_ADDR(PORTB) ; [1]
	ldi	TMP,B2		; [1] mask of the sound output bit
	brcc	1f		; [1,2] no overflow -> write port unchanged
	eor	TMP2,TMP	; [1] flip output bit state
1:	out	_SFR_IO_ADDR(PORTB),TMP2 ; [1] sound output

; [21] Keyboard input

	; [5, 17]
#ifdef MCU8
	lds	TMP,UCSRA		; [2] status register
	sbrs	TMP,RXC			; [1,2,3] receive complete?
	rjmp	2f			; [2] no key
	lds	TMP,UDR			; [2] receive character
#else
	lds	TMP,UCSR0A		; [2] status register
	sbrs	TMP,RXC0		; [1,2,3] receive complete?
	rjmp	2f			; [2] no key
	lds	TMP,UDR0		; [2] receive character
#endif
	ldd	ZL,Y+DATA_KEYWRITE	; [2] write index
	clr	ZH			; [1] ZH <- 0
	subi	ZL,lo8(-(KeyBuf))	; [1]
	sbci	ZH,hi8(-(KeyBuf))	; [1] address in keyboard buffer
	st	Z,TMP			; [2] store received key
	subi	ZL,lo8(KeyBuf-1)	; [1] ZL <- new write index (= old index + 1)
	andi	ZL,KEYBUF_MASK		; [1] mask write index (wrap around)
	std	Y+DATA_KEYWRITE,ZL	; [2] save new write index

	; [4]
	ret				; [4]

	; [12]  no key - delay loop that equalizes this path with the key path
2:	ldi	TMP,4			; [1]
3:	dec	TMP			; [1]
	brne	3b			; [1,2]

	; [4]
	ret				; [4]

; Time: [53, 64]

; Service of one non-visible scanline (reached from VRow1).
; On entry VLINEH:VLINEL holds the current scanline number. The code
;  - switches the sync output polarity when the line equals VSyncBeg/VSyncEnd,
;  - at line == VLines counts down the tone length and wraps the line number,
;  - stores the next line number and runs the sound/keyboard service,
; then falls through into RenderStop to leave the interrupt.
; NOTE(review): the wrap test is "line == VLines", so the line number takes the
; values 0..VLines inclusive (VLines+1 timer periods per frame) - confirm that
; this is intended.
VRow2:
	; [8-10] start VSYNC pulse
	ldd	TMP,Y+DATA_VSYNCBEG ; [2]
	ldd	TMP2,Y+DATA_VSYNCBEG+1 ; [2]
	cp	VLINEL,TMP	; [1]
	cpc	VLINEH,TMP2	; [1]
	brne	VRow3		; [1,2]

	ldi	TMP,BIT(COM1A1) | BIT(WGM11) ; [1]
#ifdef MCU8
	out	_SFR_IO_ADDR(TCCR1A),TMP ; [1] start VSYNC pulse (invert SYNC signal)
#else
	sts	TCCR1A,TMP	; [2] start VSYNC pulse (invert SYNC signal)
#endif


; Time: [61-63, 72]

VRow3:
	; [8-10] stop VSYNC pulse
	ldd	TMP,Y+DATA_VSYNCEND ; [2]
	ldd	TMP2,Y+DATA_VSYNCEND+1	; [2]
	cp	VLINEL,TMP	; [1]
	cpc	VLINEH,TMP2	; [1]
	brne	VRow4		; [1,2]

	ldi	TMP,BIT(COM1A1) | BIT(COM1A0) | BIT(WGM11) ; [1]
#ifdef MCU8
	out	_SFR_IO_ADDR(TCCR1A),TMP ; [1] stop VSYNC pulse (invert SYNC signal)
#else
	sts	TCCR1A,TMP	; [2] stop VSYNC pulse (invert SYNC signal)
#endif


; Time: [69-73, 80]
	
VRow4:
	; [7,8] end of frame
	ldd	TMP,Y+DATA_VLINES ; [2]
	ldd	TMP2,Y+DATA_VLINES+1 ; [2]
	cp	VLINEL,TMP	; [1]
	cpc	VLINEH,TMP2	; [1]
	brne	VRow5		; [1,2]

; Time: [84-88]

	; [5, 9, 12] sound length counter (decremented once per frame)
	ldd	TMP,Y+DATA_SNDLEN ; [2] get sound length counter
	tst	TMP		; [1] already zero?
	breq	1f		; [1,2] counter is already zero
	dec	TMP		; [1]
	std	Y+DATA_SNDLEN,TMP ; [2] (STD keeps the Z flag from DEC)
	brne	1f		; [1,2] end of tone?
	std	Y+DATA_SNDINC,TMP ; [2] clear increment (= quiet sound), TMP is 0 here
	std	Y+DATA_SNDINC+1,TMP ; [2]
1:
	; [2] clear VLine (preset it to -1, the increment below makes it 0)
	ldi	VLINEL,-1		; [1]
	ldi	VLINEH,-1		; [1]

; Time: [77-92]

VRow5:	; [6] increment vertical line and save it
	adiw	VLINEL,1		; [2] increment videoline
	std	Y+DATA_VLINE,VLINEL	; [2]
	std	Y+DATA_VLINE+1,VLINEH	; [2]

; Time: [83-98]

	; [43] Sound output and key input
	rcall	SndOutKeyIn		; [43]

; time from black lines: [126-141]

; Common exit of the video interrupt: restores every register saved at
; TIMER1_OVF_vect in exactly the reverse order of the pushes (including SREG,
; which was pushed via TMP) and returns from the interrupt.
RenderStop:

	; [27] pop registers
	pop	SAVEXH			; [2]
	pop	SAVEXL			; [2]
	pop	ZH			; [2]
	pop	ZL			; [2]
	pop	TILECNT			; [2]
	pop	LINEOFF			; [2]
	pop	XH			; [2]
	pop	XL			; [2]
	pop	TMP2			; [2]
	pop	TMP			; [2] saved status register
	out	_SFR_IO_ADDR(SREG),TMP	; [1] restore status register
	pop	TMP			; [2]
	reti				; [4]

; service of black lines: 128-147 clock cycles (= about 10-20% of CPU time)


; Time: [56] (here is Z flag = equal to first invisible line)

; Entry of the non-visible line service. It is reached by BRCC right after the
; 16-bit compare of the current line with the number of visible lines, so the
; Z flag is set only on the first non-visible line. On that line the 16-bit
; frame counter DATA_FRAME is incremented; on every other line the code goes
; straight to VRow2.
VRow1:
	brne	VRow2			; [1,2] not the first invisible line

; Time: [52]

	; [12] increment current frame
	ldd	TMP,Y+DATA_FRAME	; [2]
	ldd	TMP2,Y+DATA_FRAME+1	; [2]
	adiw	TMP,1			; [2] 16-bit increment of TMP2:TMP
	std	Y+DATA_FRAME,TMP	; [2]
	std	Y+DATA_FRAME+1,TMP2	; [2]
	rjmp	VRow2			; [2]

; ----------------------------------------------------------------------------
;                           Video rendering
; ----------------------------------------------------------------------------
; Interrupt occurs at start of HSYNC pulse.
; Stack requirements: 15 bytes

; Interrupt latency 6..9 cycles:
;  0..3 cycles for finishing currently executed instruction
;  4 cycles for pushing PC on stack and jump to interrupt vector table
;  2 cycles for the rjmp

; read TCNT1L: value will be 6..9 + 7 + 2 = 15..18 cycles

#define PUSHCLK 12 	// clock synchronization

; Timer1 overflow interrupt = start of every scanline.
; Saves the working registers, removes the variable interrupt latency so that
; all following code runs at a fixed offset from the HSYNC edge, and points X
; at the start of the video RAM (Board). Execution then falls through into
; RenderStart. Registers are restored at RenderStop.
.global	TIMER1_OVF_vect
TIMER1_OVF_vect:

; ===== push registers

; Time: [6..9] (relative to start of HSYNC pulse)

	; [5] push registers 1
	push	TMP			; [2]
	in	TMP,_SFR_IO_ADDR(SREG)	; [1] status register
	push	TMP			; [2]

; Time: [11..14]

	; [8..11] sync interrupt latency
	; The low byte of the timer tells how late the interrupt was entered.
	; Each CPI sets carry when the counter is below its threshold, and a
	; taken BRCS costs one cycle more than a not-taken one, so an earlier
	; entry is delayed by up to 3 extra cycles.
	; NOTE(review): "brcs ." relies on avr-as resolving "." so that the branch
	; continues with the following instruction (as in the "rjmp ." idiom).
	; NOTE(review): with PUSHCLK = 12 the thresholds evaluate to 13, 14 and
	; 15; the numbers quoted in the comments below do not match - confirm.
	;in	TMP,_SFR_IO_ADDR(TCNT1L); [1] MCU8 faster alternative, not supported by ATmega88
	lds	TMP,TCNT1L		; [2] get Timer1 counter
	cpi	TMP,PUSHCLK+1		; [1] ? 15
	brcs	.			; [1,2] add 1 for 14
	cpi	TMP,PUSHCLK+2		; [1] ? 16
	brcs	.			; [1,2] add 1 for 14,15
	cpi	TMP,PUSHCLK+3		; [1] ? 17
	brcs	.			; [1,2] add 1 for 14,15,16

; Time: [22]

	; [18] push registers 2
	push	TMP2			; [2]
	push	XL			; [2]
	push	XH			; [2]
	push	LINEOFF			; [2]
	push	TILECNT			; [2]
	push	ZL			; [2]
	push	ZH			; [2]
	push	SAVEXL			; [2]
	push	SAVEXH			; [2]

; ----- [2] pointer into video RAM

	ldi	XL,lo8(Board)		; [1]
	ldi	XH,hi8(Board)		; [1]

; ===== prepare vertical line

; Start of the service of one scanline. Reached by fall-through from the
; interrupt entry and by the RJMP at the end of every rendered line: while
; visible lines are being drawn the interrupt does not return, it loops here
; once per scanline, which is why the pending overflow flag is cleared first.
; Non-visible lines leave through VRow1 ... RenderStop.
RenderStart:

; Time: [42]

	; [2] clear old interrupt request
	ldi	TMP,BIT(TOV1)	; [1]
#ifdef MCU8
	out	_SFR_IO_ADDR(TIFR),TMP ; [1] clear interrupt request
#else
	out	_SFR_IO_ADDR(TIFR1),TMP ; [1] clear interrupt request
#endif

	; [4] load current vertical line
	ldd	VLINEL,Y+DATA_VLINE	; [2]
	ldd	VLINEH,Y+DATA_VLINE+1	; [2]

; Time: [48]

	; [7, 8] check visible lines
	ldd	TMP,Y+DATA_VVISIBLE	; [2]
	ldd	TMP2,Y+DATA_VVISIBLE+1	; [2] number of visible lines
	cp	VLINEL,TMP		; [1]
	cpc	VLINEH,TMP2		; [1]
	brcc	VRow1			; [1,2] jump if line is not visible (VSYNC); Z flag = first invisible line

; Time: [55]

	; [6] increment video line and save it (as next video line)
	adiw	VLINEL,1		; [2] increment video line
	std	Y+DATA_VLINE,VLINEL	; [2]
	std	Y+DATA_VLINE+1,VLINEH	; [2]

; Time: [61]

; ===== [23 TV, 26 VGA] prepare pointers and other stuffs

#define HCENTER_TV	(62+(32-WIDTH)*36/6)	// WIDTH 30: =74
#define HCENTER_VGA	(14+(32-WIDTH)*18/6)	// WIDTH 30: =20

	; [3] save current videoline
	sbiw	VLINEL,1		; [2] return video line
	mov	ZH,VLINEL		; [1] ZH <- save current videoline

	; [8 TV, 11 VGA] prepare delay periods and character line
	; (LDI does not change flags, so BRCC still tests the CPI result)
	ldd	TMP,Y+DATA_DISPMODE	; [2] current display mode (VGA modes are below DISP_TV, TV modes at or above)
	cpi	TMP,DISP_TV		; [1] TV mode?
	ldi	TMP,HCENTER_TV		; [1] horizontal back porch in TV mode
	ldi	TMP2,7			; [1] character delay in TV mode
	ldi	LINEOFF,7		; [1] line mask in TV mode
	brcc	1f			; [1,2] skip if TV mode

	ldi	TMP,HCENTER_VGA		; [1] horizontal back porch in VGA mode
	ldi	TMP2,1			; [1] character delay in VGA mode
	lsr	ZH			; [1] videoline / 2 (every font line is shown twice in VGA)
	ldi	LINEOFF,0xf		; [1] line mask in VGA mode

	; [4] prepare board address of current row
	; On the first scanline of a text row the row start is remembered in
	; SAVEXH:SAVEXL; on the other scanlines X is rewound to it, so the same
	; characters are read again for every scanline of the row.
1:	and	VLINEL,LINEOFF		; [1] check zero line
	brne	2f			; [1,2] not zero line
	movw	SAVEXL,XL		; [1] save board address
2:	movw	XL,SAVEXL		; [1] load board address

	; [6] line offset of the tile -> ZH:LINEOFF
	; (font line 0..7) << 8 >> 2 = font line * 64
	andi	ZH,7			; [1] mask sprite line index 0..7
	ldi	LINEOFF,0		; [1]
	lsr	ZH			; [1]
	ror	LINEOFF			; [1]
	lsr	ZH			; [1] line index / 4 (64 characters in font, 1 line = 64 bytes)
	ror	LINEOFF			; [1]

	; [2] add font address -> ZH:LINEOFF
	subi	LINEOFF,lo8(-(Font))	; [1] font
	sbci	ZH,hi8(-(Font))		; [1]

; Time: [84 TV, 87 VGA]

; ===== [224 TV, 62 VGA] horizontal back porch (3*TMP-1)

1:	dec	TMP			; [1]
	brne	1b			; [1,2]

; Time: [305 TV, 146 VGA]

; ----- [7,8] read number of tiles per row and limit to WIDTH -> TILECNT
; Limitation to WIDTH is needed during operations inside video memory - content may be temporary invalid

	ld	TILECNT,X+		; [2] read number of visible tiles
	cpi	TILECNT,WIDTH+1		; [1] check max. width
	brcs	1f			; [1,2]
	ldi	TILECNT,WIDTH		; [1] limit max. width
1:	tst	TILECNT			; [1] empty row?
	brne	RenderLoop1		; [1,2] render tiles

; ----- [4] first character will be a space
; (an empty row is drawn as one space; this path takes the same 5 cycles as
;  the taken branch + NOP + LD at RenderLoop1)

	ldi	TILECNT,1		; [1] number of visible characters = 1
	ldi	ZL,CH_SPC		; [1] space will be only one visible character
	rjmp	RenderLoop2		; [2]

; Time: [313 TV, 154 VGA]

; On input:
;   X = pointer into video RAM
;   ZH = address of font table HIGH
;   TMP2 = delay
;   TILECNT = character counter - 1
; Locals:
;   ZL = temporary pointer LOW
;   TMP = delay counter

; ===== [19] prepare first tile

; ----- [3] load first character from VRAM -> R31:R30

; Pixel output of one scanline. Every character code is turned into a font
; byte (LPM from ZH:ZL, where ZL = character code + LINEOFF) and written to
; the SPI data register, which shifts it out as 8 pixels. Bit 7 of the
; character code selects inverse video. All paths are cycle-balanced; do not
; add, remove or reorder instructions without recounting the timing.
RenderLoop1:
	nop				; [1] balances this entry path with the empty-row path
	ld	ZL,X+			; [2] get character from video buffer

; ----- [4] push number of visible tiles and prepare first character

RenderLoop2:
	push	TILECNT			; [2] push number of visible tiles per row
	bst	ZL,7			; [1] T <- save bit 7 (inverze flag)
	andi	ZL,0x3f			; [1] clear bit 7 (inverze flag) and bit 6

; ----- [4] load font pattern (font address must be aligned to 64 bytes)

	add	ZL,LINEOFF		; [1] add font address LOW (+ videoline), no carry into ZH
	lpm	r30,Z			; [3] load font pattern (r30 = ZL)

; ----- [2] inverze pattern

	brts	1f			; [1,2] skip if T is set (inverze flag is set)
	com	ZL			; [1] complement (inverze image)
1:

; ----- [2] SPI enable

	ldi	TMP,SPIEN		; [1]
	out	_SFR_IO_ADDR(SPCR),TMP	; [1] SPI enabled

; ----- time 0: [1] output font pattern

	out	_SFR_IO_ADDR(SPDR),ZL	; [1] output font pattern to SPI register

; ----- time 1: [3] skip in case of 1 visible character

	dec	TILECNT			; [1] decrease character counter
	breq	RenderLoop4		; [1,2] only 1 character
	nop				; [1]

; Start time: [332 TV, 173 VGA]

; ===== [1044 TV, 522 VGA] display 29 characters

RenderLoop:	; 18 or 36 clocks per loop

; ----- time 4: [4] load character from VRAM

	ld	ZL,X+			; [2] get character from video buffer
	bst	ZL,7			; [1] T <- save bit 7 (inverze flag)
	andi	ZL,0x3f			; [1] clear bit 7 (inverze flag) and bit 6

; ----- time 8: [4] load font pattern (font address must be aligned to 64 bytes)

	add	ZL,LINEOFF		; [1] add font address LOW (+ videoline)
	lpm	ZL,Z			; [3] load font pattern

; ----- time 12: [2] inverze pattern

	brts	1f			; [1,2] skip if T is set (inverze flag is set)
	com	ZL			; [1] complement (inverze image)
1:

; ----- time 14: [4 or 22] delay

	mov	TMP,TMP2		; [1] time constant 1 or 7
2:	dec	TMP			; [1] counter
	brne	2b			; [1,2] wait
	nop				; [1]

; ----- time 0: [1] output font pattern

	out	_SFR_IO_ADDR(SPDR),ZL	; [1] output font pattern to SPI register

; ----- time 1: [3] next character

	dec	TILECNT			; [1] character counter
	brne	RenderLoop		; [1,2]
	nop				; [1]

; Time: [1373 TV, 692 VGA]

; ===== [33 TV, 15 VGA] wait for last character

; ----- time 4: [5] number of invisible tiles
; The remaining (WIDTH - visible) character slots are replaced by an equally
; long delay, so the line length does not depend on the number of visible tiles.

RenderLoop4:
	pop	TMP			; [2] pop visible tiles per row
	ldi	TILECNT,WIDTH+1		; [1] number of total tiles per row + 1
	sub	TILECNT,TMP		; [1] invisible tiles + 1
	ldi	TMP,0			; [1] next delay

; ----- first pass: time 9 [5 or 23], next pass: time 0 [14 or 32] delay to wait invisible or last character

2:	nop2				; [2]
	add	TMP,TMP2		; [1] delay 0,3 + time constant 1 or 7
3:	dec	TMP			; [1]
	brne	3b			; [1,2] wait

; ----- time 14 or 32: [3,4] next invisibile tile

	ldi	TMP,3			; [1] next delay
	dec	TILECNT			; [1] tile counter
	brne	2b			; [1,2] next invisible tile

; ----- time 17 or 35: [2] SPI disable (set output to black)

	ldi	TMP,SPIDIS		; [1]
	out	_SFR_IO_ADDR(SPCR),TMP	; [1] SPI disabled

; Time: [1406 TV, 707 VGA]

; ===== [121 TV, 53 VGA] horizontal front porch

	ldd	TMP,Y+DATA_DISPMODE	; [2] current display mode (VGA modes are below DISP_TV, TV modes at or above)
	cpi	TMP,DISP_TV		; [1] TV mode?
	ldi	TMP,87 - HCENTER_TV + (32-WIDTH)*36/3 ; [1] delay in TV mode (WIDTH 30: =37)
	brcc	4f			; [1,2] skip if TV mode
	ldi	TMP,22 - HCENTER_VGA + (32-WIDTH)*18/3 ; [1] delay in VGA mode (WIDTH 30: =14)
	nop				; [1]
4:	dec	TMP			; [1]
	brne	4b			; [1,2]
	nop2				; [2]

; Time: [1580 TV, 760 VGA]

; ===== [45] sound and keyboard

	; [43] Sound output and key input
	rcall	SndOutKeyIn		; [43]

	; [2] next videoline (the interrupt is not left between visible lines)
	rjmp	RenderStart		; [2]

; jump Time: [1625 TV, 805 VGA], destination time should be [42]
;    loop time: [1583 TV, 763 VGA] ... bad calculations! :-) (should be 1529 and 762)

