Improved performance of line_down.s before loop.

2025-03-26 22:00:09 +01:00 · 2025-03-26 22:00:09 +01:00 · 801cd55541
commit 801cd55541
parent 89c9bc4129
2 changed files with 51 additions and 43 deletions
--- a/wip-hugo/macros/16aritmatic.s
+++ b/wip-hugo/macros/16aritmatic.s
@ -78,8 +78,13 @@
        ROL low_
 	ROL hi_
 .endif
 .endmacro
-
+.macro Mov_16 a_low, a_hi, b_low, b_hi
        ;; 16-bit move: copies b_low into a_low, then b_hi into a_hi.
        ;; Destination operands (a_*) come first, source operands (b_*) second.
        ;; Uses the A register; A holds the value of b_hi afterwards.
        LDA b_low
        STA a_low
        LDA b_hi
        STA a_hi
 .endmacro
 ;;Larger then operation, uses the A register
--- a/wip-hugo/routines/line/line_down.s
+++ b/wip-hugo/routines/line/line_down.s
@ -9,7 +9,26 @@
 ;;NOTE THAT X_pos <= X_end, Y_pos <= Y_end. Max 45deg!
 .proc line_down
-        .include "line.inc"; Defines memory positions, ex X_pos
+
        ;; TEMPORARY
        ;; Hack: changing dx and dy makes other line draws buggy, and the cause is not yet known.
        ;; This is of course temporary.
        ;.include "line.inc"; Defines memory positions, ex X_pos
        X_end = $04
        Y_end = $05
        X_pos = $FC
        Y_pos = $FB
        dx = $0c
        dy = $06
        dy_2 = $0607
        dx_2 = dy_2
        V = $0809
        D = $0a0b
        ;;These are also used in pixel_draw. Look there to find out more
        byte_to_paint = $FE ;Byte with one 1 that corresponds to a pixel.
        btp_mem_pos =$494A; byte to paint memory position ;Position of byte on screen
        ;;END TEMPORARY
        ;;We need to clear this memory
        LDA #$00
        STA <V
@ -18,18 +37,13 @@
        ;; V = 2*(dx -dy)
        SEC
        LDA Y_end
        SBC Y_pos
        STA >V
        STA >dy_2;  >dy_2 = dy. Needed for dy_2 (not for V)
        LDA dx
-        SEC
+        SBC dy
-        SBC >V
+        STA >V
-        STA >V; <V = dx - dy
+        mult_16 >V, <V
        mult_16 >V, <V; V = 2*(dx -dy)
        ;dy_2 = dy*2
-        mult_16 >dy_2, <dy_2, !
+        mult_16 >dy_2, <dy_2 ;>dy_2 = dy (same address)
        ;; This is an Bresenham's line algorithm, se wikipedia bellow.
        ;;https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm
@ -42,44 +56,33 @@
        ;; and to its branch logic later in the loop.
        ;;D = 2*dy - dx + 2*255
-        LDA >dy_2
+        Mov_16 >D, <D, >dy_2, <dy_2
        STA >D
        LDA <dy_2
        STA <D
        Add_16 >D, <D, #$ff, #$01, !
        Sub_16 >D, <D, dx, #$00
        jsr pixel_draw ;;only used first pixel. after this relative position is abused
        LDX X_pos
-        ;; Self modifying code. Makes LDA instructions take 1 cycle less.
+        selfmod:
-        ;; Code will run without this but slower!
+        ;; Self modifying code. Makes LDA and SBC instructions each take 1 cycle less.
-        ;; Modifies LDA instructions for dy_2 and SBC for V
+        ;; You can remove this if you run the loop without # at dy_2 and V.
        ;;Note: The offsets like +2 etc is because there are instructions betwean the label and the
        ;address that needs to be modified
-
+        ;; dy_2
-        ;;; dy_2
+        ;; Modifies LDA >dy_2
-        ;;; Modifies LDA >dy_2
+        LDA >dy_2
-        ;LDA #$A9 ; LDA (immediate)
+        STA case_2 +1
-        ;STA case_2
+        ;; Modifies LDA <dy_2
-        ;LDA >dy_2
+        LDA <dy_2
-        ;STA case_2 +1
+        STA case_2 +7
-        ;;; Modifies LDA <dy_2
+        ;; V
-        ;LDA #$A9 ; LDA (immediate)
+        ;;Modifies SBC >V
-        ;STA case_2 +6 ; ADC is +2 bytes, STA is + 2 bytes, Offset from before is +2 bytes.
+        LDA >V
-        ;LDA <dy_2
+        STA case_1 +3
-        ;STA case_2 +7
+        ;; Modifies SBC <V
-        ;;; V
+        LDA <V
-        ;;;Modidies SBC >V
+        STA case_1 +9
-        ;LDA #$E9 ;SBC (immediate)
+        end_selfmod:
        ;STA case_1 +2;LDA is +2
        ;LDA >V
        ;STA case_1 +3
        ;;; Modifies SBC <V
        ;LDA #$E9 ;SBC (immediate)
        ;STA case_1 +8; Offset before +4 bytes, STA +2, LDA +2.
        ;LDA <V
        ;STA case_1 +9
 for_x:
        ;; Paints A to address in |btp_mem_pos* + Y|
@ -104,7 +107,7 @@ increment_pixel_x_end:
        ;;else case 1.
        Lag_16 >D, <D, #$00, #$02, case_2
 case_1:; C =1 so we can use !
-        Sub_16 >D, <D, >V, <V, ! ; D = D - V
+        Sub_16 >D, <D, #>V, #<V, ! ; D = D - V
 increment_y_pos:
        INY ; Increment Y pos inside the buffer
        CPY #$08
@ -116,7 +119,7 @@ move_8px_down: ; Z=1 --> C=1
        JMP for_x
 increment_y_pos_end:
 case_2: ;; C =0 because LAG_16 so we can use !
-        Add_16 >D, <D, >dy_2, <dy_2, ! ;D = D + 2*dy
+        Add_16 >D, <D, #>dy_2, #<dy_2, ! ;D = D + 2*dy
        JMP for_x
 end:
        RTS