// sarsa_alg.h :
// Sarsa algorithm implementation - an on-line (on-policy)
// algorithm which also defines its own policy.

#ifndef SARSA_ALG_H
#define SARSA_ALG_H

#include <cstdlib>   // rand(), RAND_MAX
#include <cmath>     // sqrt()

// Note: the template parameter list and base-class arguments below were lost
// in the source and are restored here as <class State, class Action>; the base
// classes StepLrn_Alg and Policy are assumed to be defined before this header
// is included.
template <class State, class Action>
class Sarsa_Alg : public StepLrn_Alg<State,Action>, public Policy<State,Action> {
    Action StepAct;   // action chosen after the previous step
    double Eps;       // parameter for the eps-greedy policy
                      // (here Eps is the probability of taking the greedy action)
public:
    Sarsa_Alg (double eps, double lambdaVal = 1.0)
        : StepLrn_Alg<State,Action> (lambdaVal)
    {
        Eps = eps;
    }

    virtual void start_run (State startState)
    {
        CurrState = startState;
        RunCount++;
        StepAct = find_next_action(startState); // we always need to know the
                                                // next action
    }

    Action find_next_action (State theState)
    {
        // Implements the eps-greedy policy: with probability Eps take the
        // greedy action, otherwise pick an action uniformly at random.
        Action best_act = find_best(theState);
        double randTest = double(rand()) / RAND_MAX;
        // cout << "<" << theState << ">," << best_act << "," << randTest << "\n";
        if (randTest < Eps)
            return best_act;
        else {
            int actCount = rand() % get_count(theState);
            return act_by_count(theState, actCount);
        }
    }

    virtual Action choose_action (State theState)
    {
        return StepAct; // this action was chosen while receiving the
                        // previous step/start state
    }

    // This is the real update performed by Sarsa:
    //   Q(s,a) += alpha * (r + Lambda*Q(s',a') - Q(s,a)),
    // with a decaying learning rate alpha = 1/sqrt(StepCount).
    void get_step (Action stepAction, State nextState, double stepReward)
    {
        StepCount++;
        StepAct = find_next_action(nextState);
        // cerr << CurrState << "," << stepAction << "," << nextState << "," << StepAct << "\n";
        StepsData[Step(CurrState,stepAction)] +=
            (1.0 / sqrt(double(StepCount))) *
            (stepReward
             + Lambda * StepsData[Step(nextState,StepAct)]
             - StepsData[Step(CurrState,stepAction)]);
        // cerr << stepReward << "," << StepsData[Step(nextState,StepAct)] << ","
        //      << StepsData[Step(CurrState,stepAction)] << "\n";
        CurrState = nextState;
    }

    void finish_run ()
    {
        // Do nothing
    }
};

#endif // SARSA_ALG_H
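
// Usage sketch (not part of the original file): a minimal episode loop showing
// how the on-policy flow above is intended to be driven. The environment `env`
// and its methods initial_state(), step(action, reward_out) and done() are
// hypothetical names introduced only for illustration; the Sarsa_Alg calls
// (start_run / choose_action / get_step / finish_run) are the ones declared above.
//
//   Sarsa_Alg<State,Action> sarsa(0.9 /* Eps */, 0.95 /* lambdaVal */);
//   for (int episode = 0; episode < nEpisodes; episode++) {
//       State s = env.initial_state();
//       sarsa.start_run(s);                       // also picks the first action
//       while (!env.done()) {
//           Action a = sarsa.choose_action(s);    // action chosen on the previous call
//           double r;
//           State s2 = env.step(a, r);            // hypothetical signature
//           sarsa.get_step(a, s2, r);             // Sarsa update + choose next action
//           s = s2;
//       }
//       sarsa.finish_run();
//   }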