Step 3: Upgrade our components

Improve the Environment

Adding previous_p1_action and previous_p2_action to the observation lets the client implementation from the previous step display the previous decisions of the human and the agent. Modify the .proto file as follows:

data.proto

...
message Observation {
    int32 p1_score = 1;
    int32 p2_score = 2;
    HumanAction previous_p1_action = 3;
    AgentAction previous_p2_action = 4;
}
...

Whenever the .proto files are modified, we need to run the generation step again so that cog_settings.py and the *_pb2.py modules stay up to date:

cogment generate --python_dir=.

Modify the environment file as follows so that these values are updated on every turn.

env.py

    ...
    def update(self, actions):
        print("environment updating")

        self.observation.previous_p1_action.CopyFrom(actions.human[0])
        self.observation.previous_p2_action.CopyFrom(actions.agent[0])

        p1_decision = actions.human[0].decision
    ...
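
Note that previous_p1_action and previous_p2_action are message fields, which is why they are filled with CopyFrom rather than a plain assignment. A minimal sketch of that behaviour, assuming the HumanAction message and the ROCK enum value defined in the earlier steps:

from data_pb2 import Observation, HumanAction, ROCK

obs = Observation()
# Direct assignment to a protobuf message field is not allowed:
# obs.previous_p1_action = HumanAction(decision=ROCK)  # raises AttributeError
obs.previous_p1_action.CopyFrom(HumanAction(decision=ROCK))
assert obs.previous_p1_action.decision == ROCK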

Send feedback

Rewarding the agent is an important part of training a model. In this example, at every update, the environment sends feedback to each actor, including our single agent.

The orchestrator calculates the average of all the feedback sent to an actor in order to create a single reward.
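
As a rough illustration of that idea (this is not Cogment API code, just an assumption about how (value, confidence) pairs could be combined into one number):

# Illustrative sketch only - the real aggregation happens inside the orchestrator
def average_feedback(feedback):
    # feedback: list of (value, confidence) pairs sent to one actor
    total_confidence = sum(confidence for _, confidence in feedback)
    if total_confidence == 0:
        return 0.0
    return sum(value * confidence for value, confidence in feedback) / total_confidence

print(average_feedback([(1, 1)]))   # a single winning feedback -> reward 1.0
print(average_feedback([(-1, 1)]))  # a single losing feedback -> reward -1.0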

env.py

    ...
    def update(self, actions):
        ...
        print(f"human played {choice[p1_decision]} - agent played {choice[p2_decision]}")

        p1_feedback = 0
        p2_feedback = 0
        if p1_decision == p2_decision:
            # Tie: neutral feedback for both players
            pass
        elif ((p1_decision == ROCK and p2_decision == SCISSOR) or
              (p1_decision == PAPER and p2_decision == ROCK) or
              (p1_decision == SCISSOR and p2_decision == PAPER)):
            # The human (p1) wins the round
            self.observation.p1_score += 1
            p1_feedback = 1
            p2_feedback = -1
        else:
            # The agent (p2) wins the round
            self.observation.p2_score += 1
            p1_feedback = -1
            p2_feedback = 1

        print(f"human score {self.observation.p1_score} - agent score {self.observation.p2_score}")

        # Send feedback to both actors; the orchestrator aggregates it into rewards
        self.trial.actors.human[0].add_feedback(value=p1_feedback, confidence=1)
        self.trial.actors.agent[0].add_feedback(value=p2_feedback, confidence=1)

        obs_table = cog_settings.ObservationsTable(self.trial)
        for o in obs_table.all_observations():
            o.snapshot = self.observation

        return obs_table
    ...

Restart the environment -

docker-compose restart env

Improve the Agent

This section outlines how to include a supervised learning model for RPS that predicts the human player's next move, letting the agent respond with the counter-move. Download the model to the root of the project (the “rps” folder) from GitLab.
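
As an optional sanity check (assuming the downloaded file is named model_supervised.h5, as it is in the agent code further down), you can confirm that the model loads and inspect its input and output shapes:

from keras.models import load_model

model = load_model("model_supervised.h5")
# The agent code below feeds the model a zero-padded history of 18 moves
print(model.input_shape, model.output_shape)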

New Python dependencies are required, so we'll update rps/agents/Dockerfile:

rps/agents/Dockerfile

FROM python:3.7

ENV PYTHONPATH /app

# Uncomment following and set specific version if required for only this service
#ENV COGMENT_VERSION 0.3.0a5
#RUN pip install cogment==$COGMENT_VERSION
# Comment following if above is used
ADD requirements.txt .
RUN pip install -r requirements.txt
RUN pip install keras==2.2.4 \
    tensorflow==1.14.0 \
    numpy==1.17.2

WORKDIR /app

ADD agents/agent ./agents/agent
ADD cog_settings.py .
ADD *_pb2.py .

CMD ["python", "agents/agent/main.py"]

Rebuild the image -

docker-compose build agent

Update the agent -

rps/agents/agent/main.py

import cog_settings
import numpy as np
import tensorflow as tf

from collections import deque
from keras.models import load_model
from data_pb2 import AgentAction, NONE, ROCK, PAPER, SCISSOR
from cogment import Agent, GrpcServer

# Keep a handle on the default TensorFlow graph so the Keras model can be
# used from the server's worker threads
graph = tf.get_default_graph()

model = load_model('model_supervised.h5')


def GetPrediction(moves):
    # Pad the move history with NONE (0) so the model always receives 18 values
    if len(moves) < 18:
        moves = moves + ([0] * (18 - len(moves)))

    moves_np = np.array([moves])

    with graph.as_default():
        prediction = model.predict(moves_np).argmax()

    return prediction

def GetAgentMove(moves):
    # Predict the human's next move and play the move that beats it
    prediction = GetPrediction(moves)
    to_play = {
        ROCK: PAPER,
        PAPER: SCISSOR,
        SCISSOR: ROCK,
    }

    return to_play[prediction]

class Agent(Agent):
    VERSIONS = {"agent": "1.0.0"}
    actor_class = cog_settings.actor_classes.agent

    def __init__(self, trial, actor, config):
        super().__init__(trial, actor, config)
        self.moves_history = deque(maxlen=18)

    def decide(self, observation):

        # p2 is the agent and p1 is the human
        if observation.previous_p1_action.decision != NONE:
            self.moves_history.append(observation.previous_p2_action.decision)
            self.moves_history.append(observation.previous_p1_action.decision)

        action = AgentAction()
        action.decision = GetAgentMove(list(self.moves_history))

        print(f"Agent decide {action.decision}")
        return action

    def reward(self, reward):
        print("Player reward")

    def on_message(self, sender, msg):
        if msg:
            print(f'Agent {self.id_in_class} received message - {msg} from sender {sender}')

    def end(self):
        print("Player end")



if __name__ == '__main__':
    server = GrpcServer(Agent, cog_settings)
    server.serve()