Documentation

LeanMDPs.Histories

structure MDPs.MDP (σ α : Type) :

Markov decision process

  • S : Finset σ

    states

  • S_ne : self.S.Nonempty
  • A : Finset α

    actions

  • A_ne : self.A.Nonempty
  • P : σ → α → Δ self.S

    transition probability s, a, s'

  • r : σ → α → σ → ℝ

    reward function s, a, s'

Instances For
    inductive MDPs.Hist {σ α : Type} (M : MDPs.MDP σ α) :

    Represents a history. The state is type σ and action is type α.

    Instances For
      instance MDPs.instCoeHist {σ α : Type} {M : MDPs.MDP σ α} :
      Coe σ (MDPs.Hist M)

      Coerces a state to a history

      Equations
      @[reducible]
      def MDPs.Hist.length {σ α : Type} {M : MDPs.MDP σ α} :

      The length of the history corresponds to the zero-based step of the decision

      Equations
      Instances For
        @[reducible, inline]
        abbrev MDPs.HistNE {σ α : Type} (m : MDPs.MDP σ α) :

        Nonempty histories

        Equations
        Instances For
          def MDPs.Hist.last {σ α : Type} {M : MDPs.MDP σ α} :
          MDPs.Hist M → σ

          Returns the last state of the history

          Equations
          Instances For
            def MDPs.Hist.append {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (as : α × σ) :

            Appends the state and action to the history

            Equations
            • h.append as = h.foll as.1 as.2
            Instances For
              def MDPs.Hist.one {σ α : Type} {M : MDPs.MDP σ α} (s₀ : σ) (a : α) (s : σ) :
              Equations
              Instances For
                def MDPs.Hist.prefix {σ α : Type} {M : MDPs.MDP σ α} (k : ℕ) (h : MDPs.Hist M) :

                Return the prefix of hist of length k

                Equations
                Instances For
                  def MDPs.tuple2hist {σ α : Type} {M : MDPs.MDP σ α} :
                  MDPs.Hist M × α × σ → MDPs.HistNE M
                  Equations
                  Instances For
                    def MDPs.hist2tuple {σ α : Type} {M : MDPs.MDP σ α} :
                    MDPs.HistNE M → MDPs.Hist M × α × σ
                    Equations
                    Instances For
                      def MDPs.emb_tuple2hist_l1 {σ α : Type} {M : MDPs.MDP σ α} :
                      Equations
                      Instances For
                        def MDPs.emb_tuple2hist {σ α : Type} {M : MDPs.MDP σ α} :
                        Equations
                        Instances For
                          def MDPs.state2hist {σ α : Type} {M : MDPs.MDP σ α} (s : σ) :
                          Equations
                          Instances For
                            def MDPs.hist2state {σ α : Type} {M : MDPs.MDP σ α} :
                            MDPs.Hist M → σ
                            Equations
                            Instances For
                              def MDPs.state2hist_emb {σ α : Type} {M : MDPs.MDP σ α} :
                              Equations
                              Instances For
                                def MDPs.isprefix {σ α : Type} {M : MDPs.MDP σ α} :

                                Checks if the first hist is the prefix of the second hist.

                                Equations
                                Instances For
                                  def MDPs.Histories {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) :

                                  All histories of additional length t that follow history h

                                  Equations
                                  Instances For
                                    @[reducible, inline]
                                    abbrev MDPs.ℋ {σ α : Type} {M : MDPs.MDP σ α} :
                                    Equations
                                    Instances For
                                      theorem MDPs.hist_lenth_eq_horizon {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (t : ℕ) (h' : MDPs.Hist M) :
                                      h' ∈ MDPs.ℋ h t → h'.length = h.length + t
                                      @[reducible, inline]
                                      abbrev MDPs.ℋₜ {σ α : Type} {M : MDPs.MDP σ α} :
                                      Equations
                                      Instances For
                                        def MDPs.PolicyHR {σ α : Type} (M : MDPs.MDP σ α) :

                                        A randomized history-dependent policy

                                        Equations
                                        Instances For
                                          instance MDPs.instCoeSubtypeMemFinsetAPolicyHROfDecidableEq {σ α : Type} {M : MDPs.MDP σ α} [DecidableEq α] :
                                          Coe { x : α // x ∈ M.A } (MDPs.PolicyHR M)
                                          Equations
                                          def MDPs.DecisionRule {σ α : Type} (M : MDPs.MDP σ α) :

                                          Decision rule

                                          Equations
                                          Instances For
                                            instance MDPs.instCoeSubtypeMemFinsetADecisionRuleOfDecidableEq {σ α : Type} {M : MDPs.MDP σ α} [DecidableEq α] :
                                            Coe { x : α // x ∈ M.A } (MDPs.DecisionRule M)
                                            Equations
                                            def MDPs.PolicySD {σ α : Type} (M : MDPs.MDP σ α) :

                                            A deterministic stationary policy

                                            Equations
                                            Instances For
                                              def MDPs.HistDist {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) :
                                              Δ (MDPs.ℋ h T)

                                              Probability distribution over histories induced by the policy and transition probabilities

                                              Equations
                                              Instances For
                                                @[reducible, inline]
                                                abbrev MDPs.Δℋ {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) :
                                                Equations
                                                Instances For
                                                  def MDPs.reward {σ α : Type} {M : MDPs.MDP σ α} :

                                                  Reward of a history

                                                  Equations
                                                  Instances For
                                                    def MDPs.reward_at {σ α : Type} {M : MDPs.MDP σ α} (i : ℕ) :

                                                    Reward at a specific position; 0-based

                                                    Equations
                                                    Instances For
                                                      def MDPs.reward_to {σ α : Type} {M : MDPs.MDP σ α} (j : ℕ) :

                                                      Sum of rewards from the beginning up to a specific position

                                                      Equations
                                                      Instances For
                                                        def MDPs.reward_from {σ α : Type} {M : MDPs.MDP σ α} (j : ℕ) :

                                                        Sum of rewards from a specific position to the end

                                                        Equations
                                                        Instances For
                                                          def MDPs.prob_h {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (h' : { x : MDPs.Hist M // x ∈ MDPs.ℋ h T }) :

                                                          The probability of a history

                                                          Equations
                                                          Instances For
                                                            def MDPs.probability_h {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (B : MDPs.Hist M → Bool) :

                                                            Probability of a boolean event

                                                            Equations
                                                            Instances For
                                                              Equations
                                                              • One or more equations did not get rendered due to their size.
                                                              Instances For
                                                                def MDPs.expect_h {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (X : MDPs.Hist M → ℝ) :

                                                                Expectation over histories for a r.v. X for horizon T and policy π

                                                                Equations
                                                                Instances For
                                                                  Equations
                                                                  • One or more equations did not get rendered due to their size.
                                                                  Instances For
                                                                    noncomputable def MDPs.expect_h_cnd {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (X : MDPs.Hist M → ℝ) (B : MDPs.Hist M → Bool) :

                                                                    Conditional expectation over histories for a r.v. X for horizon T and policy π

                                                                    Equations
                                                                    Instances For
                                                                      Equations
                                                                      • One or more equations did not get rendered due to their size.
                                                                      Instances For
                                                                        noncomputable def MDPs.expect_h_cnd_rv {σ α : Type} {M : MDPs.MDP σ α} (h : MDPs.Hist M) (π : MDPs.PolicyHR M) (T : ℕ) (X : MDPs.Hist M → ℝ) {ν : Type} [DecidableEq ν] (Y : MDPs.Hist M → ν) :
                                                                        Equations
                                                                        Instances For
                                                                          Equations
                                                                          • One or more equations did not get rendered due to their size.
                                                                          Instances For
                                                                            def MDPs.state {σ α : Type} {M : MDPs.MDP σ α} (k : ℕ) (h : MDPs.Hist M) :
                                                                            σ

                                                                            The k-th state of a history. The initial state is state 0.

                                                                            Equations
                                                                            Instances For
                                                                              def MDPs.action {σ α : Type} {M : MDPs.MDP σ α} [Inhabited α] (k : ℕ) (h : MDPs.Hist M) :
                                                                              α

                                                                              The k-th action of a history. The first action is action 0.

                                                                              Equations
                                                                              Instances For
                                                                                def MDPs.Histrv {σ α : Type} (M : MDPs.MDP σ α) :

                                                                                Random variable on histories sans distribution (policy dependent)

                                                                                Equations
                                                                                Instances For
                                                                                  instance MDPs.instCoeRtoRV {σ α : Type} {M : MDPs.MDP σ α} :
                                                                                  Equations
                                                                                  instance MDPs.instHAddHRV {σ α : Type} {M : MDPs.MDP σ α} :
                                                                                  Equations
                                                                                  instance MDPs.instHAddRVRV {σ α : Type} {M : MDPs.MDP σ α} :
                                                                                  Equations
                                                                                  theorem MDPs.exph_add_rv {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (X Y : MDPs.Histrv M) :
                                                                                  MDPs.expect_h h π t (X + Y) = MDPs.expect_h h π t X + MDPs.expect_h h π t Y
                                                                                  theorem MDPs.exph_const {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (X : MDPs.Histrv M) (c : ℝ) :
                                                                                  (MDPs.expect_h h π t fun (x : MDPs.Hist M) => c) = c
                                                                                  theorem MDPs.exph_add_const {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (X : MDPs.Hist M → ℝ) (c : ℝ) :
                                                                                  MDPs.expect_h h π t ((fun (x : MDPs.Hist M) => c) + X) = c + MDPs.expect_h h π t X
                                                                                  theorem MDPs.exph_congr {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (X Y : MDPs.Hist M → ℝ) (rv_eq : ∀ h' ∈ MDPs.ℋ h t, X h' = Y h') :
                                                                                  MDPs.expect_h h π t X = MDPs.expect_h h π t Y

                                                                                  Expected return can be expressed as a sum of expected rewards

                                                                                  def MDPs.rew_sum {σ α : Type} {M : MDPs.MDP σ α} [Inhabited α] (h : MDPs.Hist M) :
                                                                                  Equations
                                                                                  Instances For
                                                                                    def MDPs.rew_sum_rg {σ α : Type} {M : MDPs.MDP σ α} [Inhabited α] (b e : ℕ) (h : MDPs.Hist M) :

                                                                                    Sum of rewards with start (b) and end (e) (is exclusive)

                                                                                    Equations
                                                                                    Instances For
                                                                                      theorem MDPs.state_last {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {k : ℕ} (keq : k = h.length) :
                                                                                      MDPs.state k h = h.last
                                                                                      theorem MDPs.state_foll_last {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {s : σ} {a : α} {k : ℕ} (keq : k = h.length) :
                                                                                      MDPs.state k (h.foll a s) = h.last
                                                                                      theorem MDPs.action_last {σ α : Type} {M : MDPs.MDP σ α} {s : σ} {a : α} [Inhabited α] {h : MDPs.Hist M} {k : ℕ} (keq : k = h.length + 1) :
                                                                                      MDPs.action (h.foll a s).length (h.foll a s) = a
                                                                                      theorem MDPs.state_foll_eq {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {s : σ} {a : α} {k : ℕ} (kleq : k ≤ h.length) :
                                                                                      MDPs.state k h = MDPs.state k (h.foll a s)
                                                                                      theorem MDPs.ret_eq_sum_rew {σ α : Type} {M : MDPs.MDP σ α} [Inhabited σ] [Inhabited α] (h : MDPs.Hist M) :
                                                                                      theorem MDPs.expret_eq_sum_rew {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} [Inhabited σ] [Inhabited α] :

                                                                                      Expected return can be expressed as a sum of expected rewards

                                                                                      theorem MDPs.sum_rew_eq_sum_rew_rg {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} [Inhabited σ] [Inhabited α] :
                                                                                      theorem MDPs.exph_zero_horizon_eq_zero {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} [Inhabited σ] [Inhabited α] (hzero : h.length = 0) :
                                                                                      theorem MDPs.exph_zero_horizon_eq_zero_f {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} [Inhabited σ] [Inhabited α] (hzero : h.length = 0) :
                                                                                      theorem MDPs.exph_horizon_cut {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} [Inhabited σ] [Inhabited α] {X : MDPs.Histrv M} (k : ℕ) (kle : k ≤ t) (eqpastk : ∀ (h : MDPs.Hist M), X h = X (MDPs.Hist.prefix k h)) :
                                                                                      MDPs.expect_h h π t X = MDPs.expect_h h π k X

                                                                                      When the random variable beyond a point does not matter, cut the horizon's expectation

                                                                                      theorem MDPs.exph_horizon_trim {σ α : Type} {M : MDPs.MDP σ α} {π : MDPs.PolicyHR M} [Inhabited σ] [Inhabited α] {X : MDPs.Histrv M} {h : MDPs.Hist M} (s : { x : σ // x ∈ M.S }) (a : { x : α // x ∈ M.A }) :
                                                                                      X (h.foll a s) = X (MDPs.Hist.one h.last a s) → MDPs.expect_h h π 1 X = MDPs.expect_h (MDPs.Hist.init h.last) π 1 X
                                                                                      theorem MDPs.total_expectation_h {σ α : Type} {M : MDPs.MDP σ α} {ν : Type} [DecidableEq ν] {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} {X : MDPs.Hist M → ℝ} {Y : MDPs.Hist M → ν} :
                                                                                      MDPs.expect_h h π t (MDPs.expect_h_cnd_rv h π t X Y) = MDPs.expect_h h π t X
                                                                                      theorem MDPs.exph_cond_eq_hist {σ α : Type} {M : MDPs.MDP σ α} {h : MDPs.Hist M} {π : MDPs.PolicyHR M} {t : ℕ} (s : { x : σ // x ∈ M.S }) (a : { x : α // x ∈ M.A }) [Inhabited α] [Inhabited σ] [BEq α] [BEq σ] :
                                                                                      (MDPs.expect_h_cnd h π (t + 1) MDPs.reward fun (h' : MDPs.Hist M) => decide ((MDPs.action h.length h' == a) = true ∧ (MDPs.state (h.length + 1) h' == s) = true)) = MDPs.expect_h (h.foll a s) π t MDPs.reward