Documentation

LeanMDPs.Histories

structure MDPs.MDP (σ α : Type) :

Markov decision process

  • S : Finset σ

    states

  • S_ne : self.S.Nonempty
  • A : Finset α

    actions

  • A_ne : self.A.Nonempty
  • P : σ → α → Δ self.S

    transition probability s, a, s'

  • r : σ → α → σ → ℝ

    reward function s, a, s'

Instances For
    inductive MDPs.Hist {σ α : Type} (M : MDPs.MDP σ α) :

    Represents a history. The state is type σ and action is type α.

    Instances For
      instance MDPs.instCoeHist {σ α : Type} {M : MDPs.MDP σ α} :
      Coe σ (MDPs.Hist M)

      Coerces a state to a history

      Equations
      @[reducible]
      def MDPs.Hist.length {σ α : Type} {M : MDPs.MDP σ α} :

      The length of the history corresponds to the zero-based step of the decision

      Equations
      Instances For
        @[reducible, inline]
        abbrev MDPs.HistNE {σ α : Type} (m : MDPs.MDP σ α) :

        Nonempty histories

        Equations
        Instances For
          def MDPs.Hist.last {σ α : Type} {M : MDPs.MDP σ α} :
          MDPs.Hist M → σ

          Returns the last state of the history

          Equations
          Instances For
            def MDPs.Hist.append {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (as : α × σ) :

            Appends the state and action to the history

            Equations
            • h.append as = h.foll as.1 as.2
            Instances For
              def MDPs.Hist.one {σ α : Type} {M : MDPs.MDP σ α} (s₀ : σ) (a : α) (s : σ) :
              Equations
              Instances For
                def MDPs.Hist.prefix {σ α : Type} {M : MDPs.MDP σ α} (k : ℕ) (h : MDPs.Hist M) :

                Return the prefix of hist of length k

                Equations
                Instances For
                  def MDPs.tuple2hist {σ α : Type} {M : MDPs.MDP σ α} :
                  MDPs.Hist M × α × σ → MDPs.HistNE M
                  Equations
                  Instances For
                    def MDPs.hist2tuple {σ α : Type} {M : MDPs.MDP σ α} :
                    MDPs.HistNE M → MDPs.Hist M × α × σ
                    Equations
                    Instances For
                      def MDPs.emb_tuple2hist_l1 {σ α : Type} {M : MDPs.MDP σ α} :
                      Equations
                      Instances For
                        def MDPs.emb_tuple2hist {σ α : Type} {M : MDPs.MDP σ α} :
                        Equations
                        Instances For
                          def MDPs.state2hist {σ α : Type} {M : MDPs.MDP σ α} (s : σ) :
                          Equations
                          Instances For
                            def MDPs.hist2state {σ α : Type} {M : MDPs.MDP σ α} :
                            MDPs.Hist M → σ
                            Equations
                            Instances For
                              def MDPs.state2hist_emb {σ α : Type} {M : MDPs.MDP σ α} :
                              Equations
                              Instances For
                                def MDPs.isprefix {σ α : Type} {M : MDPs.MDP σ α} :

                                Checks if the first hist is the prefix of the second hist.

                                Equations
                                Instances For
                                  def MDPs.Histories {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) :

                                  All histories of additional length t that follow history h

                                  Equations
                                  Instances For
                                    @[reducible, inline]
                                    abbrev MDPs.ℋ {σ α : Type} {M : MDPs.MDP σ α} :
                                    Equations
                                    Instances For
                                      theorem MDPs.hist_lenth_eq_horizon {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (t : ℕ) (h' : MDPs.Hist M) :
                                      h' ∈ MDPs.ℋ h t → h'.length = h.length + t
                                      @[reducible, inline]
                                      abbrev MDPs.ℋₜ {σ α : Type} {M : MDPs.MDP σ α} :
                                      Equations
                                      Instances For
                                        def MDPs.PolicyHR {σ α : Type} (M : MDPs.MDP σ α) :

                                        A randomized history-dependent policy

                                        Equations
                                        Instances For
                                          instance MDPs.instCoeSubtypeMemFinsetAPolicyHROfDecidableEq {σ α : Type} {M : MDPs.MDP σ α} [DecidableEq α] :
                                          Coe { x : α // x ∈ M.A } (MDPs.PolicyHR M)
                                          Equations
                                          def MDPs.DecisionRule {σ α : Type} (M : MDPs.MDP σ α) :

                                          Decision rule

                                          Equations
                                          Instances For
                                            instance MDPs.instCoeSubtypeMemFinsetADecisionRuleOfDecidableEq {σ α : Type} {M : MDPs.MDP σ α} [DecidableEq α] :
                                            Coe { x : α // x ∈ M.A } (MDPs.DecisionRule M)
                                            Equations
                                            def MDPs.PolicySD {σ α : Type} (M : MDPs.MDP σ α) :

                                            A deterministic stationary policy

                                            Equations
                                            Instances For
                                              def MDPs.HistDist {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) :
                                              Δ (MDPs.ℋ h T)

                                              Probability distribution over histories induced by the policy and transition probabilities

                                              Equations
                                              Instances For
                                                @[reducible, inline]
                                                abbrev MDPs.Δℋ {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) :
                                                Equations
                                                Instances For
                                                  def MDPs.reward {σ α : Type} {M : MDPs.MDP σ α} :

                                                  Reward of a history

                                                  Equations
                                                  Instances For
                                                    def MDPs.reward_at {σ α : Type} {M : MDPs.MDP σ α} (i : ℕ) :

                                                    Reward at a specific position; 0-based

                                                    Equations
                                                    Instances For
                                                      def MDPs.reward_to {σ α : Type} {M : MDPs.MDP σ α} (j : ℕ) :

                                                      Sum of rewards from the beginning up to a specific position

                                                      Equations
                                                      Instances For
                                                        def MDPs.reward_from {σ α : Type} {M : MDPs.MDP σ α} (j : ℕ) :

                                                        Sum of rewards from a specific position to the end

                                                        Equations
                                                        Instances For
                                                          def MDPs.prob_h {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (h' : { x : MDPs.Hist M // x ∈ MDPs.ℋ h T }) :

                                                          The probability of a history

                                                          Equations
                                                          Instances For
                                                            def MDPs.probability_h {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (B : MDPs.Hist M → Bool) :

                                                            Probability of a boolean event

                                                            Equations
                                                            Instances For
                                                              Equations
                                                              • One or more equations did not get rendered due to their size.
                                                              Instances For
                                                                def MDPs.expect_h {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (X : MDPs.Hist M → ℝ) :

                                                                Expectation over histories for a r.v. X for horizon T and policy π

                                                                Equations
                                                                Instances For
                                                                  Equations
                                                                  • One or more equations did not get rendered due to their size.
                                                                  Instances For
                                                                    noncomputable def MDPs.expect_h_cnd {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (X : MDPs.Hist M → ℝ) (B : MDPs.Hist M → Bool) :

                                                                    Conditional expectation over histories for a r.v. X for horizon T and policy π

                                                                    Equations
                                                                    Instances For
                                                                      Equations
                                                                      • One or more equations did not get rendered due to their size.
                                                                      Instances For
                                                                        noncomputable def MDPs.expect_h_cnd_rv {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (X : MDPs.Hist M → ℝ) {ν : Type} [DecidableEq ν] (Y : MDPs.Hist M → ν) :
                                                                        Equations
                                                                        Instances For
                                                                          Equations
                                                                          • One or more equations did not get rendered due to their size.
                                                                          Instances For
                                                                            def MDPs.state {σ α : Type} {M : MDPs.MDP σ α} (k : ℕ) (h : MDPs.Hist M) :
                                                                            σ

                                                                            The k-th state of a history. The initial state is state 0.

                                                                            Equations
                                                                            Instances For
                                                                              def MDPs.action {σ α : Type} {M : MDPs.MDP σ α} [Inhabited α] (k : ℕ) (h : MDPs.Hist M) :
                                                                              α

                                                                              The k-th action of a history. The first action is action 0.

                                                                              Equations
                                                                              Instances For
                                                                                def MDPs.Histrv {σ α : Type} (M : MDPs.MDP σ α) :

                                                                                Random variable on histories sans distribution (policy dependent)

                                                                                Equations
                                                                                Instances For
                                                                                  instance MDPs.instCoeRtoRV {σ α : Type} {M : MDPs.MDP σ α} :
                                                                                  Equations
                                                                                  instance MDPs.instHAddHRV {σ α : Type} {M : MDPs.MDP σ α} :
                                                                                  Equations
                                                                                  instance MDPs.instHAddRVRV {σ α : Type} {M : MDPs.MDP σ α} :
                                                                                  Equations
                                                                                  theorem MDPs.exph_add_rv {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (X Y : MDPs.Histrv M) :
                                                                                  MDPs.expect_h h π t (X + Y) = MDPs.expect_h h π t X + MDPs.expect_h h π t Y
                                                                                  theorem MDPs.exph_const {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (X : MDPs.Histrv M) (c : ℝ) :
                                                                                  (MDPs.expect_h h π t fun (x : MDPs.Hist M) => c) = c
                                                                                  theorem MDPs.exph_add_const {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (X : MDPs.Hist M → ℝ) (c : ℝ) :
                                                                                  MDPs.expect_h h π t ((fun (x : MDPs.Hist M) => c) + X) = c + MDPs.expect_h h π t X
                                                                                  theorem MDPs.exph_congr {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (X Y : MDPs.Hist M → ℝ) (rv_eq : ∀ h' ∈ MDPs.ℋ h t, X h' = Y h') :
                                                                                  MDPs.expect_h h π t X = MDPs.expect_h h π t Y

                                                                                  Expected return can be expressed as a sum of expected rewards

                                                                                  def MDPs.rew_sum {σ α : Type} {M : MDPs.MDP σ α} [Inhabited α] (h : MDPs.Hist M) :
                                                                                  Equations
                                                                                  Instances For
                                                                                    def MDPs.rew_sum_rg {σ α : Type} {M : MDPs.MDP σ α} [Inhabited α] (b e : ℕ) (h : MDPs.Hist M) :

                                                                                    Sum of rewards with start (b) and end (e) (is exclusive)

                                                                                    Equations
                                                                                    Instances For
                                                                                      theorem MDPs.state_last {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {k : ℕ} (keq : k = h.length) :
                                                                                      MDPs.state k h = h.last
                                                                                      theorem MDPs.state_foll_last {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {s : σ} {a : α} {k : ℕ} (keq : k = h.length) :
                                                                                      MDPs.state k (h.foll a s) = h.last
                                                                                      theorem MDPs.action_last {σ α : Type} {M : MDPs.MDP σ α} {s : σ} {a : α} [Inhabited α] {h : MDPs.Hist M} {k : ℕ} (keq : k = h.length + 1) :
                                                                                      MDPs.action (h.foll a s).length (h.foll a s) = a
                                                                                      theorem MDPs.state_foll_eq {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {s : σ} {a : α} {k : ℕ} (kleq : k ≤ h.length) :
                                                                                      MDPs.state k h = MDPs.state k (h.foll a s)
                                                                                      theorem MDPs.ret_eq_sum_rew {σ α : Type} {M : MDPs.MDP σ α} [Inhabited σ] [Inhabited α] (h : MDPs.Hist M) :
                                                                                      theorem MDPs.expret_eq_sum_rew {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} [Inhabited σ] [Inhabited α] :

                                                                                      Expected return can be expressed as a sum of expected rewards

                                                                                      theorem MDPs.sum_rew_eq_sum_rew_rg {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} [Inhabited σ] [Inhabited α] :
                                                                                      theorem MDPs.exph_zero_horizon_eq_zero {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} [Inhabited σ] [Inhabited α] (hzero : h.length = 0) :
                                                                                      theorem MDPs.exph_zero_horizon_eq_zero_f {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} [Inhabited σ] [Inhabited α] (hzero : h.length = 0) :
                                                                                      theorem MDPs.exph_horizon_cut {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} [Inhabited σ] [Inhabited α] {X : MDPs.Histrv M} (k : ℕ) (kle : k ≤ t) (eqpastk : ∀ (h : MDPs.Hist M), X h = X (MDPs.Hist.prefix k h)) :
                                                                                      MDPs.expect_h h π t X = MDPs.expect_h h π k X

                                                                                      When the random variable beyond a point does not matter, cut the horizon's expectation

                                                                                      theorem MDPs.exph_horizon_trim {σ α : Type} {M : MDPs.MDP σ α} {π : MDPs.PolicyHR M} [Inhabited σ] [Inhabited α] {X : MDPs.Histrv M} {h : MDPs.Hist M} (s : { x : σ // x ∈ M.S }) (a : { x : α // x ∈ M.A }) :
                                                                                      X (h.foll a s) = X (MDPs.Hist.one h.last a s) → MDPs.expect_h h π 1 X = MDPs.expect_h (MDPs.Hist.init h.last) π 1 X
                                                                                      theorem MDPs.total_expectation_h {σ α : Type} {M : MDPs.MDP σ α} {ν : Type} [DecidableEq ν] {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} {X : MDPs.Hist M → ℝ} {Y : MDPs.Hist M → ν} :
                                                                                      MDPs.expect_h h π t (MDPs.expect_h_cnd_rv h π t X Y) = MDPs.expect_h h π t X
                                                                                      theorem MDPs.exph_cond_eq_hist {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (s : { x : σ // x ∈ M.S }) (a : { x : α // x ∈ M.A }) [Inhabited α] [Inhabited σ] [BEq α] [BEq σ] :
                                                                                      (MDPs.expect_h_cnd h π (t + 1) MDPs.reward fun (h' : MDPs.Hist M) => decide ((MDPs.action h.length h' == a) = true ∧ (MDPs.state (h.length + 1) h' == s) = true)) = MDPs.expect_h (h.foll a s) π t MDPs.reward