Add a first attempt at waitloop skipping

This change introduces a first attempt at waitloop detection and
skipping. Games like Pokemon Emerald and Kirby Nightmare in Dreamland
don't properly halt the CPU when waiting for an event like VBLANK, and
instead just burn the CPU in a loop.

This change specifically focuses on those games. Other games likely use
more instructions than I added support for here, and other games likely
also may use a waitloop technique that this approach can't prove to be
free of side-effects. Further changes will add support for more
instructions, but I don't expect to add support for games that wait in a
more complicated way. (This emulator is for my own enjoyment, after all,
and it supports the games I want to play :p)

Prior to this change, Pokemon Ruby saw ~430 fps on my M2 MacBook Air,
while Pokemon Emerald and Kirby both saw ~290 fps. With this change, all
games now see ~430 fps, including Ruby which didn't see any regression.
For this reason, I've enabled waitloop skipping by default.

In subsequent changes, I plan to add support for more instructions, add
proper configuration in the UI (enable/disable, set instruction limit,
etc), as well as potentially add a CPU meter to better visualize the
impact of this change.
This commit is contained in:
Matthew Berry 2022-12-03 11:48:33 -08:00
parent 591c8b41c3
commit f6d88f82fa
5 changed files with 308 additions and 3 deletions

View file

@ -9,6 +9,7 @@ class GBAController < Controller
@debug_window = false
@scheduler_window = false
@experimental_settings = false
def initialize(config : Config, bios : String?, rom : String)
@emu = GBA::GBA.new(bios || config.gba.bios, rom, config.run_bios)
@ -26,6 +27,7 @@ class GBAController < Controller
def render_debug_items : Nil
ImGui.menu_item("Video", "", pointerof(@debug_window))
ImGui.menu_item("Scheduler", "", pointerof(@scheduler_window))
ImGui.menu_item("Experimental Settings", "", pointerof(@experimental_settings))
end
def render_windows : Nil
@ -54,6 +56,12 @@ class GBAController < Controller
end
end
end
if @experimental_settings
ImGui.window("Experimental Settings", pointerof(@experimental_settings)) do
ImGui.checkbox("Attempt waitloop detection", pointerof(@emu.cpu.attempt_waitloop_detection))
ImGui.checkbox("Cache waitloop results", pointerof(@emu.cpu.cache_waitloop_results))
end
end
end
private def render_palettes_tab_item : Nil

View file

@ -106,7 +106,7 @@ module GBA
end
@[AlwaysInline]
private def read_half_internal(address : UInt32) : UInt16
def read_half_internal(address : UInt32) : UInt16
address &= ~1
case bits(address, 24..27)
when 0x0 then (@bios.to_unsafe + (address & 0x3FFF)).as(UInt16*).value

View file

@ -1,11 +1,13 @@
require "./arm/*"
require "./thumb/*"
require "./pipeline"
require "./waitloop"
module GBA
class CPU
include ARM
include THUMB
include Waitloop
CLOCK_SPEED = 2**24
@ -142,7 +144,12 @@ module GBA
arm_execute instr
end
cycles, @gba.bus.cycles = @gba.bus.cycles, 0
@gba.scheduler.tick cycles
if @entered_waitloop
@gba.scheduler.fast_forward
@entered_waitloop = false
else
@gba.scheduler.tick cycles
end
else
@gba.scheduler.fast_forward
end

View file

@ -3,8 +3,10 @@ module GBA
def thumb_conditional_branch(instr : UInt32) : Nil
cond = bits(instr, 8..11)
offset = bits(instr, 0..7).to_i8!.to_i32
branch_dest = @r[15] &+ (offset * 2)
analyze_loop(branch_dest, @r[15] - 4) # this instruction exists at @r[15] - 4
if check_cond cond
set_reg(15, @r[15] &+ (offset * 2))
set_reg(15, branch_dest)
else
step_thumb
end

288
src/crab/gba/waitloop.cr Normal file
View file

@ -0,0 +1,288 @@
module GBA
module Waitloop
# Whether waitloop detection should be attempted.
property attempt_waitloop_detection : Bool = true
# Whether to try caching successful and unsuccessful branch destinations.
property cache_waitloop_results : Bool = true
# The previous branch destination.
@branch_dest = 0_u32
# Collection of branch destinations identified as waitloops.
@identified_waitloops = Array(UInt32).new
# Collection of branch destinations identified as non-waitloops.
@identified_non_waitloops = Array(UInt32).new
# Flags when a waitloop is detected. Used by the CPU to fast-forward.
@entered_waitloop = false
# Table to quickly look up an instruction's class.
getter waitloop_instr_lut : Slice(Instruction.class) { build_lut }
# Attempt to detect a waitloop. Assumes thumb instructions.
def analyze_loop(start_addr : UInt32, end_addr : UInt32) : Nil
return unless @attempt_waitloop_detection
return unless start_addr == @branch_dest
return unless start_addr < end_addr && 2 <= end_addr - start_addr <= 8 # only analyze up to 4 thumb instruction
if @cache_waitloop_results
if @identified_waitloops.includes?(start_addr)
@entered_waitloop = true
return
end
return if @identified_non_waitloops.includes?(start_addr)
end
written_bits = never_write = 0_u16
(start_addr...end_addr).step(2) do |addr|
instr = @gba.bus.read_half_internal(addr)
parsed_instr = waitloop_instr_lut[instr >> 8].parse?(instr)
unless parsed_instr && parsed_instr.read_only?
@identified_non_waitloops.push(start_addr) if @cache_waitloop_results
return
end
never_write |= parsed_instr.read_bits & ~written_bits
if written_bits & never_write > 0 # first write to a register was after a read, which could indicate an impure loop.
@identified_non_waitloops.push(start_addr) if @cache_waitloop_results
return
end
written_bits |= parsed_instr.write_bits
end
@identified_waitloops.push(start_addr) if @cache_waitloop_results
@entered_waitloop = true
ensure
@branch_dest = start_addr
end
def build_lut : Slice(Instruction.class)
Slice(Instruction.class).new(256) do |idx|
case
when idx & 0b11110000 == 0b11110000 then LongBranchLink
when idx & 0b11111000 == 0b11100000 then UnconditionalBranch
when idx & 0b11111111 == 0b11011111 then SoftwareInterrupt
when idx & 0b11110000 == 0b11010000 then ConditionalBranch
when idx & 0b11110000 == 0b11000000 then MultipleLoadStore
when idx & 0b11110110 == 0b10110100 then PushPopRegisters
when idx & 0b11111111 == 0b10110000 then AddOffsetToStackPointer
when idx & 0b11110000 == 0b10100000 then LoadAddress
when idx & 0b11110000 == 0b10010000 then SpRelativeLoadStore
when idx & 0b11110000 == 0b10000000 then LoadStoreHalfword
when idx & 0b11100000 == 0b01100000 then LoadStoreImmediateOffset
when idx & 0b11110010 == 0b01010010 then LoadStoreSignExtended
when idx & 0b11110010 == 0b01010000 then LoadStoreRegisterOffset
when idx & 0b11111000 == 0b01001000 then PcRelativeLoad
when idx & 0b11111100 == 0b01000100 then HighRegBranchExchange
when idx & 0b11111100 == 0b01000000 then AluOperations
when idx & 0b11100000 == 0b00100000 then MoveCompareAddSubtract
when idx & 0b11111000 == 0b00011000 then AddSubtract
when idx & 0b11100000 == 0b00000000 then MoveShiftedRegister
else Unimplemented
end
end
end
abstract struct Instruction
# Attempt to parse the instruction. Nilable to support unimplemented insts.
def self.parse?(instruction : UInt16) : Instruction?
end
# Indicates that this instruction doesn't attempt to write to storage.
def read_only?
false
end
# Each set bit indicates a register this instruction reads from.
def read_bits : UInt16
0xFF_u16
end
# Each set bit indicates a register this instruction writes to.
def write_bits : UInt16
0xFF_u16
end
end
struct LongBranchLink < Instruction
end
struct UnconditionalBranch < Instruction
end
struct SoftwareInterrupt < Instruction
end
struct ConditionalBranch < Instruction
def initialize(@cond : UInt16, @offset : Int32)
end
def read_only? : Bool
true
end
def read_bits : UInt16
0_u16
end
def write_bits : UInt16
0_u16
end
def self.parse?(instr : UInt16) : ConditionalBranch
cond = bits(instr, 8..11)
offset = bits(instr, 0..7).to_i8!.to_i32
new(cond, offset)
end
end
struct MultipleLoadStore < Instruction
end
struct PushPopRegisters < Instruction
end
struct AddOffsetToStackPointer < Instruction
end
struct LoadAddress < Instruction
end
struct SpRelativeLoadStore < Instruction
end
struct LoadStoreHalfword < Instruction
def initialize(@load : Bool, @offset : UInt16, @rb : UInt16, @rd : UInt16)
end
def read_only? : Bool
@load
end
def read_bits : UInt16
res = 1_u16 << @rb
res |= 1_u16 << @rd unless @load
res
end
def write_bits : UInt16
if @load
1_u16 << @rd
else
0_u16
end
end
def self.parse?(instr : UInt16) : LoadStoreHalfword
load = bit?(instr, 11)
offset = bits(instr, 6..10)
rb = bits(instr, 3..5)
rd = bits(instr, 0..2)
new(load, offset, rb, rd)
end
end
struct LoadStoreImmediateOffset < Instruction
end
struct LoadStoreSignExtended < Instruction
end
struct LoadStoreRegisterOffset < Instruction
end
struct PcRelativeLoad < Instruction
end
struct HighRegBranchExchange < Instruction
end
struct AluOperations < Instruction
def initialize(@op : UInt16, @rs : UInt16, @rd : UInt16)
end
def read_only? : Bool
true
end
def read_bits : UInt16
1_u16 << @rs | 1_u16 << @rd
end
def write_bits : UInt16
return 0_u16 if @op == 0b1000_u16 || @op == 0b1010_u16 || @op == 0b1011_u16
1_u16 << @rd
end
def self.parse?(instr : UInt16) : AluOperations
op = bits(instr, 6..9)
rs = bits(instr, 3..5)
rd = bits(instr, 0..2)
new(op, rs, rd)
end
end
struct MoveCompareAddSubtract < Instruction
def initialize(@op : UInt16, @rd : UInt16, @offset : UInt16)
end
def read_only? : Bool
true
end
def read_bits : UInt16
return 0_u16 if @op == 0
1_u16 << @rd
end
def write_bits : UInt16
return 0_u16 if @op == 1
1_u16 << @rd
end
def self.parse?(instr : UInt16) : MoveCompareAddSubtract
op = bits(instr, 11..12)
rd = bits(instr, 8..10)
offset = bits(instr, 0..7)
new(op, rd, offset)
end
end
struct AddSubtract < Instruction
def initialize(@imm_flag : Bool, @sub : Bool, @imm_or_rn : UInt16, @rs : UInt16, @rd : UInt16)
end
def read_only? : Bool
true
end
def read_bits : UInt16
res = 1_u16 << @rs
res |= 1_u16 << @imm_or_rn unless @imm_flag
res
end
def write_bits : UInt16
1_u16 << @rd
end
def self.parse?(instr : UInt16) : AddSubtract
imm_flag = bit?(instr, 10)
sub = bit?(instr, 9)
imm_or_rn = bits(instr, 6..8)
rs = bits(instr, 3..5)
rd = bits(instr, 0..2)
new(imm_flag, sub, imm_or_rn, rs, rd)
end
end
struct MoveShiftedRegister < Instruction
end
struct Unimplemented < Instruction
end
end
end