Documentation

LeanMDPs.Expected

noncomputable def Finset.argmax' {τ : Type u_1} (I : Finset τ) (H : I.Nonempty) (f : { x : τ // x ∈ I } → ℚ) :
{ x : τ // x ∈ I }
Equations
  • I.argmax' H f = match List.argmax f I.attach.toList, ⋯ with | some x, x_1 => x
Instances For
    theorem le_of_mem_argmax' {τ : Type u_1} {a : τ} {S : Finset τ} {f : { x : τ // x ∈ S } → ℚ} (h : a ∈ S) (NE : S.Nonempty) :
    f ⟨a, h⟩ ≤ f (S.argmax' NE f)
    theorem argmax_eq_sup {τ : Type u_1} {S : Finset τ} {f : { x : τ // x ∈ S } → ℚ} (NE : S.Nonempty) :
    S.attach.sup' ⋯ f = f (S.argmax' NE f)
    structure MDPs.ObjectiveFH {σ α : Type} (M : MDPs.MDP σ α) :

    Finite horizon objective parameters

    • s₀ : { x : σ // x ∈ M.S }
    • T : ℕ
    Instances For
      def MDPs.objective_fh {σ α : Type} {M : MDPs.MDP σ α} (O : MDPs.ObjectiveFH M) (π : MDPs.PolicyHR M) :

      Objective function

      Equations
      Instances For
        def MDPs.Optimal_fh {σ α : Type} {M : MDPs.MDP σ α} (O : MDPs.ObjectiveFH M) (πopt : MDPs.PolicyHR M) :

        An optimal policy πopt

        Equations
        Instances For
          def MDPs.ValuesH {σ α : Type} (M : MDPs.MDP σ α) :

          Value function type for history-dependent value functions

          Equations
          Instances For
            def MDPs.hvalue_π {σ α : Type} {M : MDPs.MDP σ α} (π : MDPs.PolicyHR M) :

            History-dependent value function

            Equations
            Instances For
              def MDPs.OptimalVF_fh {σ α : Type} {M : MDPs.MDP σ α} (t : ℕ) (πopt : MDPs.PolicyHR M) :

              A value-optimal policy πopt

              Equations
              Instances For
                theorem MDPs.optimalvf_imp_optimal {σ α : Type} {M : MDPs.MDP σ α} {O : MDPs.ObjectiveFH M} (πopt : MDPs.PolicyHR M) (opt : MDPs.OptimalVF_fh O.T πopt) :
                def MDPs.DPhπ {σ α : Type} {M : MDPs.MDP σ α} (π : MDPs.PolicyHR M) (v : MDPs.ValuesH M) :

                Bellman operator on history-dependent value functions

                Equations
                Instances For
                  def MDPs.DPhopt {σ α : Type} [DecidableEq α] {M : MDPs.MDP σ α} (u : MDPs.ValuesH M) :

                  Bellman operator on history-dependent value functions

                  Equations
                  • One or more equations did not get rendered due to their size.
                  Instances For
                    def MDPs.u_dp_π {σ α : Type} {M : MDPs.MDP σ α} (π : MDPs.PolicyHR M) :

                    Dynamic program value function, finite-horizon history dependent

                    Equations
                    Instances For
                      def MDPs.u_dp_opt {σ α : Type} [DecidableEq α] {M : MDPs.MDP σ α} :

                      Dynamic program value function, finite-horizon history dependent

                      Equations
                      Instances For
                        theorem MDPs.dp_opt_ge_dp_pi {σ : Type} [Inhabited σ] [DecidableEq σ] {α : Type} [Inhabited α] [DecidableEq α] {M : MDPs.MDP σ α} (h : MDPs.Hist M) (u₁ u₂ : MDPs.ValuesH M) (uge : ∀ (h : MDPs.Hist M), u₁ h ≥ u₂ h) (h✝ : MDPs.Hist M) (π : MDPs.PolicyHR M) :
                        MDPs.DPhopt u₁ h✝ ≥ MDPs.DPhπ π u₂ h✝
                        theorem MDPs.dph_correct_vf {σ : Type} [Inhabited σ] [DecidableEq σ] {α : Type} [Inhabited α] [DecidableEq α] {M : MDPs.MDP σ α} (π : MDPs.PolicyHR M) (t : ℕ) (h : MDPs.Hist M) :
                        theorem MDPs.dph_opt_vf_opt {σ : Type} [Inhabited σ] [DecidableEq σ] {α : Type} [Inhabited α] [DecidableEq α] {M : MDPs.MDP σ α} (t : ℕ) (π : MDPs.PolicyHR M) (h : MDPs.Hist M) :
                        def MDPs.PolicyMD {σ α : Type} (M : MDPs.MDP σ α) :

                        A deterministic Markov policy. Depends on the time step, and does not depend on the horizon.

                        Equations
                        Instances For
                          Equations
                          def MDPs.Values {σ α : Type} :
                          MDPs.MDP σ α → Type

                          History-independent value function. Note that the optimal value function is history-independent, while the value function of a Markov policy depends on the time step.

                          Equations
                          Instances For
                            def MDPs.q_of_v {σ α : Type} [DecidableEq α] {M : MDPs.MDP σ α} (s : σ) (a : { x : α // x ∈ M.A }) (v : MDPs.Values M) :

                            Markov q function

                            Equations
                            Instances For
                              def MDPs.DPMopt {σ α : Type} [DecidableEq α] {M : MDPs.MDP σ α} (v : MDPs.Values M) :

                              Bellman operator on history-dependent value functions

                              Equations
                              Instances For
                                def MDPs.v_dp_opt {σ α : Type} [DecidableEq α] {M : MDPs.MDP σ α} :

                                Optimal value function

                                Equations
                                Instances For
                                  theorem MDPs.v_dp_opt_eq_u_opt {σ : Type} [Inhabited σ] [DecidableEq σ] {α : Type} [Inhabited α] [DecidableEq α] {M : MDPs.MDP σ α} {t : ℕ} (h : MDPs.Hist M) :

                                  The Markov DP is optimal

                                  noncomputable def MDPs.πopt {σ α : Type} [DecidableEq α] {M : MDPs.MDP σ α} (t : ℕ) :

                                  Optimal policy for horizon t

                                  Equations
                                  Instances For
                                    theorem MDPs.v_dp_opt_eq_v_dp_π {σ : Type} [Inhabited σ] [DecidableEq σ] {α : Type} [Inhabited α] [DecidableEq α] {M : MDPs.MDP σ α} {T : ℕ} (h : MDPs.Hist M) :
                                    h.length ≤ T → MDPs.v_dp_opt (T - h.length) h.last = MDPs.u_dp_π (fun (h : MDPs.Hist M) => Finprob.dirac_dist M.A (MDPs.πopt T h.length h.last)) (T - h.length) h

                                    Greedy to v_opt is optimal policy

                                    def MDPs.ValuesM {σ α : Type} :
                                    MDPs.MDP σ α → Type
                                    Equations
                                    Instances For
                                      def MDPs.DPMπ {σ α : Type} [DecidableEq α] {M : MDPs.MDP σ α} (π : MDPs.PolicyMD M) (v : MDPs.ValuesM M) :

                                      Optimal Bellman operator on state-dependent value functions. Also includes the prior history's reward.

                                      Equations
                                      Instances For
                                        def MDPs.v_dp_π {σ α : Type} [DecidableEq α] {M : MDPs.MDP σ α} (π : MDPs.PolicyMD M) :

                                        Value function of a Markov policy. Horizon to value function.

                                        Equations
                                        Instances For
                                          theorem MDPs.v_eq_u_π {σ : Type} [Inhabited σ] [DecidableEq σ] {α : Type} [Inhabited α] [DecidableEq α] {M : MDPs.MDP σ α} {t : ℕ} {π : MDPs.PolicyMD M} (h : MDPs.Hist M) :
                                          MDPs.u_dp_π (fun (h : MDPs.Hist M) => Finprob.dirac_dist M.A (π h.length h.last)) t h = MDPs.v_dp_π π t h.length h.last
                                          theorem MDPs.markov_u_quot {σ : Type} [Inhabited σ] [DecidableEq σ] {α : Type} [Inhabited α] [DecidableEq α] {M : MDPs.MDP σ α} {t : ℕ} {π : MDPs.PolicyMD M} (h₁ h₂ : MDPs.Hist M) :
                                          h₁.length = h₂.length → h₁.last = h₂.last → MDPs.u_dp_π (fun (h : MDPs.Hist M) => Finprob.dirac_dist M.A (π h.length h.last)) t h₁ = MDPs.u_dp_π (fun (h : MDPs.Hist M) => Finprob.dirac_dist M.A (π h.length h.last)) t h₂