Equations
- I.argmax' H f = match List.argmax f I.attach.toList, ⋯ with | some x, x_1 => x
Instances For
Objective function
Equations
- MDPs.objective_fh O π = MDPs.expect_h (MDPs.Hist.init ↑O.s₀) π O.T MDPs.reward
Instances For
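In standard MDP notation, and reading `MDPs.expect_h h π T r` as the expected cumulative reward `r` over `T` steps of policy `π` starting from history `h` (an assumption about the library's semantics, not stated here), the objective unfolds to:

```latex
\rho(\pi) \;=\; \mathbb{E}^{\pi}\!\left[\,\sum_{t=0}^{T-1} r(h_t) \;\middle|\; h_0 = (s_0)\,\right]
```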
An optimal policy πopt
Equations
- MDPs.Optimal_fh O πopt = ∀ (π : MDPs.PolicyHR M), MDPs.objective_fh O πopt ≥ MDPs.objective_fh O π
Instances For
Value function type for history-dependent value functions
Equations
- MDPs.ValuesH M = (MDPs.Hist M → ℝ)
Instances For
History-dependent value function
Equations
- MDPs.hvalue_π π Nat.zero = fun (x : MDPs.Hist M) => 0
- MDPs.hvalue_π π t.succ = fun (h : MDPs.Hist M) => MDPs.expect_h h π t.succ (MDPs.reward_from h.length)
Instances For
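A hedged reading of these two equations: with zero steps to go the value is `0`, and with `t+1` steps to go it is the expected reward-to-go under `π`, where `reward_from h.length` indicates that rewards are counted starting at the current history's length:

```latex
v^{\pi}_{0}(h) = 0,
\qquad
v^{\pi}_{t+1}(h) \;=\; \mathbb{E}^{\pi}\!\left[\,\sum_{k=0}^{t} r(h_{k}) \;\middle|\; h_{0} = h\,\right]
```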
A value-optimal policy πopt
Equations
- MDPs.OptimalVF_fh t πopt = ∀ (π : MDPs.PolicyHR M) (h : MDPs.Hist M), MDPs.hvalue_π πopt t h ≥ MDPs.hvalue_π π t h
Instances For
Bellman operator on history-dependent value functions
Equations
- MDPs.DPhπ π v x✝ = MDPs.expect_h x✝ π 1 fun (h' : MDPs.Hist M) => MDPs.reward h' + v h'
Instances For
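In operator notation this is the usual one-step Bellman backup: the expectation ranges over one-step extensions `h'` of the history `h` under `π` (a sketch of the intended semantics, assuming `expect_h h π 1` is a one-step expectation):

```latex
(T^{\pi} v)(h) \;=\; \mathbb{E}^{\pi}_{\,h' \,\sim\, h}\bigl[\, r(h') + v(h') \,\bigr]
```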
Optimal Bellman operator on history-dependent value functions
Equations
- One or more equations did not get rendered due to their size.
Instances For
Dynamic program value function, finite-horizon history dependent
Equations
- MDPs.u_dp_π π Nat.zero = fun (x : MDPs.Hist M) => 0
- MDPs.u_dp_π π t.succ = MDPs.DPhπ π (MDPs.u_dp_π π t)
Instances For
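Writing `T^π` for the operator `MDPs.DPhπ π`, the two equations are the standard backward-induction recursion for policy evaluation:

```latex
u^{\pi}_{0} = 0,
\qquad
u^{\pi}_{t+1} = T^{\pi}\, u^{\pi}_{t}
```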
Optimal dynamic program value function, finite-horizon history dependent
Equations
- MDPs.u_dp_opt Nat.zero = fun (x : MDPs.Hist M) => 0
- MDPs.u_dp_opt t.succ = MDPs.DPhopt (MDPs.u_dp_opt t)
Instances For
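Writing `T*` for the optimal operator `MDPs.DPhopt`, this is the same recursion with the maximizing backup in place of the policy backup:

```latex
u^{*}_{0} = 0,
\qquad
u^{*}_{t+1} = T^{*}\, u^{*}_{t}
```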
A deterministic Markov policy. It depends on the time step but does not depend on the horizon.
Equations
- MDPs.PolicyMD M = (ℕ → MDPs.DecisionRule M)
Instances For
Coerces a deterministic Markov policy to a history-dependent randomized policy: the decision rule is applied to the length and last state of the history, and the chosen action is lifted to a Dirac (point-mass) distribution.
Equations
- MDPs.instCoePolicyMDPolicyHROfDecidableEq = { coe := fun (d : MDPs.PolicyMD M) (h : MDPs.Hist M) => Finprob.dirac_dist M.A (d h.length h.last) }
History-independent value function. Note that the optimal value function is history-independent, while the value function of a Markov policy depends on the time step.
Equations
- MDPs.Values x✝ = (σ → ℝ)
Instances For
Markov q function
Equations
- MDPs.q_of_v s a v = MDPs.expect_h (MDPs.Hist.init s) (fun (x : MDPs.Hist M) => Finprob.dirac_dist M.A a) 1 fun (h : MDPs.Hist M) => MDPs.reward h + v h.last
Instances For
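A hedged reading: the q-value of `(s, a)` with respect to a state value function `v` is a one-step expectation from the singleton history at `s` under the Dirac policy at `a`, with `v` evaluated at the resulting last state:

```latex
q(s, a; v) \;=\; \mathbb{E}\bigl[\, r(s, a, s') + v(s') \,\bigr]
```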
Optimal Bellman operator on state-dependent value functions
Equations
- MDPs.DPMopt v x✝ = M.A.attach.sup' ⋯ fun (a : { x : α // x ∈ M.A }) => MDPs.q_of_v x✝ a v
Instances For
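Since `M.A.attach.sup'` takes a supremum over the (nonempty, finite) action set, this is the familiar optimal Bellman backup:

```latex
(T^{*} v)(s) \;=\; \max_{a \in \mathcal{A}} \; q(s, a; v)
```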
Optimal value function
Equations
- MDPs.v_dp_opt Nat.zero = fun (x : σ) => 0
- MDPs.v_dp_opt t.succ = MDPs.DPMopt (MDPs.v_dp_opt t)
Instances For
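Combining the two equations with the definition of `MDPs.DPMopt` gives finite-horizon value iteration, sketched below:

```latex
v^{*}_{0}(s) = 0,
\qquad
v^{*}_{t+1}(s) \;=\; \max_{a \in \mathcal{A}} \; q\bigl(s, a; v^{*}_{t}\bigr)
```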
The Markov dynamic program is optimal
Optimal policy for horizon t
Equations
- MDPs.πopt t x✝¹ x✝ = if t ≥ x✝¹ then M.A.argmax' ⋯ fun (a : { x : α // x ∈ M.A }) => MDPs.q_of_v x✝ a (MDPs.v_dp_opt (t - x✝¹)) else Classical.indefiniteDescription (fun (a : α) => a ∈ M.A) ⋯
Instances For
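A hedged reading of the two branches: at time step `k ≤ t` the policy acts greedily with respect to the optimal value function for the remaining `t - k` steps, and past the horizon (`k > t`) it returns an arbitrary action from `M.A` (via `Classical.indefiniteDescription`):

```latex
\pi^{*}_{t}(k, s) \;\in\; \operatorname*{arg\,max}_{a \in \mathcal{A}} \; q\bigl(s, a; v^{*}_{t-k}\bigr)
\qquad (k \le t)
```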
The greedy policy with respect to the optimal value function is an optimal policy
Value function type for time-dependent (Markov) value functions
Equations
- MDPs.ValuesM x✝ = (ℕ → σ → ℝ)
Instances For
Bellman operator for a Markov policy on state-dependent value functions. Also includes the prior history's reward.
Equations
- MDPs.DPMπ π v x✝¹ x✝ = MDPs.expect_h (MDPs.Hist.init x✝) (fun (h : MDPs.Hist M) => Finprob.dirac_dist M.A (π h.length h.last)) 1 fun (h : MDPs.Hist M) => MDPs.reward h + v (x✝¹ + 1) h.last
Instances For
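In standard notation, with the action fixed to the one chosen by the deterministic Markov policy at the current time step (a sketch under the same one-step-expectation reading of `expect_h` as above):

```latex
(T^{\pi} v)(t, s) \;=\; \mathbb{E}\bigl[\, r\bigl(s, \pi(t, s), s'\bigr) + v(t+1, s') \,\bigr]
```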
Value function of a Markov policy: maps a horizon to the corresponding value function.
Equations
- MDPs.v_dp_π π Nat.zero = fun (x : ℕ) => 0
- MDPs.v_dp_π π t.succ = MDPs.DPMπ π (MDPs.v_dp_π π t)
Instances For