amoux · March 6, 2022 05:45 · Mar 6, 2022 · Mar 6, 2022
diff --git a/char2char-rnn-with-ego.ipynb b/char2char-rnn-with-ego.ipynb
@@ -1,5 +1,15 @@
 {
   "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/gist/amoux/b0ea4db5d5c3f29c8205bd24aee1b47c/char2char-rnn-with-ego.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,
@@ -966,7 +976,8 @@
     },
     "colab": {
       "name": "Char2Char RNN with ego 🔥.ipynb",
-      "provenance": []
+      "provenance": [],
+      "include_colab_link": true
     }
   },
   "nbformat": 4,

diff --git a/char2char-rnn-with-ego.ipynb b/char2char-rnn-with-ego.ipynb
@@ -0,0 +1,974 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d0206cbf",
+      "metadata": {
+        "id": "d0206cbf"
+      },
+      "outputs": [],
+      "source": [
+        "import ego\n",
+        "import ego.nn as nn\n",
+        "import ego.nn.functional as F\n",
+        "import ego.optim as optim\n",
+        "import ego.text as text"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c552c25c",
+      "metadata": {
+        "id": "c552c25c",
+        "outputId": "2777c7b6-f335-4919-c4a7-3de87b90af4b"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "Tokenizer(vocab_size=19, maxlen=15, type=char, base=<CharPreTokenization>)"
+            ]
+          },
+          "execution_count": 2,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "corpus = ['hey how are you',\n",
+        "          'good i am fine',\n",
+        "          'have a nice day']\n",
+        "\n",
+        "tokenizer = text.Tokenizer('char', min_freq=0, lower_case=True, \n",
+        "                           use_special_tokens=False).fit(corpus)\n",
+        "tokenizer"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "162f2d20",
+      "metadata": {
+        "id": "162f2d20",
+        "outputId": "0aea901f-e391-4c12-bdc4-04b11f0ac83f"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "x : hey how are yo\n",
+            "y : ey how are you\n",
+            "x : good i am fine\n",
+            "y : ood i am fine \n",
+            "x : have a nice da\n",
+            "y : ave a nice day\n"
+          ]
+        }
+      ],
+      "source": [
+        "PAD = \" \"\n",
+        "seqs = corpus.copy()\n",
+        "for i in range(len(seqs)):\n",
+        "    while len(seqs[i]) < tokenizer.maxlen:\n",
+        "        seqs[i] += PAD\n",
+        "input_seqs = []\n",
+        "target_seqs = []\n",
+        "for i, seq in enumerate(seqs):\n",
+        "    assert len(seq) == tokenizer.maxlen\n",
+        "    input_seqs.append(seq[:-1])\n",
+        "    target_seqs.append(seq[1:])\n",
+        "    print(f\"x : {input_seqs[i]}\\ny : {target_seqs[i]}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "5c9e72a5",
+      "metadata": {
+        "id": "5c9e72a5",
+        "outputId": "a1b24753-20f1-4a0b-fb0e-1c2d81f0cbbb"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "tensor([[ 6,  3,  7,  2,  6,  5, 11,  2,  4, 12,  3,  2,  7,  5]])\n",
+            "tensor([[ 3,  7,  2,  6,  5, 11,  2,  4, 12,  3,  2,  7,  5, 13]]) \n",
+            "\n",
+            "['h', 'e', 'y', ' ', 'h', 'o', 'w', ' ', 'a', 'r', 'e', ' ', 'y', 'o']\n",
+            "['e', 'y', ' ', 'h', 'o', 'w', ' ', 'a', 'r', 'e', ' ', 'y', 'o', 'u']\n",
+            "X: ego.Size([3, 14, 19]) | y: ego.Size([3, 14])\n"
+          ]
+        }
+      ],
+      "source": [
+        "input_ids = tokenizer(input_seqs, return_attention_mask=False,\n",
+        "                      return_tensors=\"ego\")\n",
+        "target_ids = tokenizer(target_seqs, return_attention_mask=False,\n",
+        "                       return_tensors=\"ego\")\n",
+        "input_features = F.one_hot(input_ids, num_classes=tokenizer.vocab_size)\n",
+        "\n",
+        "print(input_ids[:1])\n",
+        "print(target_ids[:1], '\\n')\n",
+        "print(tokenizer.decode(input_ids[0]))\n",
+        "print(tokenizer.decode(target_ids[0]))\n",
+        "print(\"X: {} | y: {}\".format(input_features.size(), target_ids.size()))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "6cba9c06",
+      "metadata": {
+        "id": "6cba9c06",
+        "outputId": "b03275b6-3ce4-4528-b281-a9fbac18146d"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Char2CharRNN(\n",
+            "  (rnn): GRU(19, 32, batch_first=True)\n",
+            "  (fc): Linear(in_features=32, out_features=19, bias=True)\n",
+            ")\n"
+          ]
+        }
+      ],
+      "source": [
+        "class Char2CharRNN(nn.Module):\n",
+        "    def __init__(self, ninp, nout, nhid, nlayer=1, dropout=0, is_batch=True):\n",
+        "        super(Char2CharRNN, self).__init__()\n",
+        "        self.nhid = nhid\n",
+        "        self.rnn = nn.GRU(ninp, nhid, nlayer, bias=True,\n",
+        "                          batch_first=is_batch, dropout=dropout)\n",
+        "        self.fc = nn.Linear(nhid, nout)\n",
+        "\n",
+        "    def inference(self):\n",
+        "        self.eval()  # set self, parent and children modules to eval mode\n",
+        "        # set the requires_grad flag to False for all defined parameters\n",
+        "        self.requires_grad_(False)\n",
+        "        # clear accumulated gradients for all defined parameters (if any)\n",
+        "        self.zero_grad(set_to_none=True)\n",
+        "\n",
+        "    def forward(self, x):\n",
+        "        ih, hh = self.rnn(x)\n",
+        "        hidden = ih.view(-1, self.nhid)\n",
+        "        logits = self.fc(hidden)\n",
+        "        if self.training:\n",
+        "            # keep tensor 2D from <Linear>[BxT, Nt] -> <NLLLoss>(x:2D, y:1D)\n",
+        "            return logits\n",
+        "        else:\n",
+        "            # otherwise, reshape as 3D<OneHot>[B, T, Nt]\n",
+        "            return logits.reshape_as(x)\n",
+        "\n",
+        "\n",
+        "# vocab_size, hidden_size, num_layers\n",
+        "Nt, H, L = (tokenizer.vocab_size, 32, 1)\n",
+        "\n",
+        "model = Char2CharRNN(Nt, Nt, H, L)\n",
+        "criterion = nn.CrossEntropyLoss()\n",
+        "optimizer = optim.RMSprop(model.parameters(), lr=0.01)\n",
+        "print(model)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d5a57b12",
+      "metadata": {
+        "scrolled": true,
+        "id": "d5a57b12",
+        "outputId": "4eac87fb-ab4a-41f7-91b3-989bc48118a3"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "epoch : 0/50\t.......... loss  : 2.9805\n",
+            "epoch : 1/50\t.......... loss  : 2.6823\n",
+            "epoch : 2/50\t.......... loss  : 2.5745\n",
+            "epoch : 3/50\t.......... loss  : 2.2057\n",
+            "epoch : 4/50\t.......... loss  : 1.8892\n",
+            "epoch : 5/50\t.......... loss  : 1.5189\n",
+            "epoch : 6/50\t.......... loss  : 1.2244\n",
+            "epoch : 7/50\t.......... loss  : 1.0763\n",
+            "epoch : 8/50\t.......... loss  : 0.9662\n",
+            "epoch : 9/50\t.......... loss  : 0.6695\n",
+            "epoch : 10/50\t.......... loss  : 0.5876\n",
+            "epoch : 11/50\t.......... loss  : 0.4450\n",
+            "epoch : 12/50\t.......... loss  : 0.3818\n",
+            "epoch : 13/50\t.......... loss  : 0.3738\n",
+            "epoch : 14/50\t.......... loss  : 0.2685\n",
+            "epoch : 15/50\t.......... loss  : 0.1938\n",
+            "epoch : 16/50\t.......... loss  : 0.1572\n",
+            "epoch : 17/50\t.......... loss  : 0.1356\n",
+            "epoch : 18/50\t.......... loss  : 0.1208\n",
+            "epoch : 19/50\t.......... loss  : 0.1094\n",
+            "epoch : 20/50\t.......... loss  : 0.1004\n",
+            "epoch : 21/50\t.......... loss  : 0.0931\n",
+            "epoch : 22/50\t.......... loss  : 0.0872\n",
+            "epoch : 23/50\t.......... loss  : 0.0822\n",
+            "epoch : 24/50\t.......... loss  : 0.0780\n",
+            "epoch : 25/50\t.......... loss  : 0.0743\n",
+            "epoch : 26/50\t.......... loss  : 0.0712\n",
+            "epoch : 27/50\t.......... loss  : 0.0684\n",
+            "epoch : 28/50\t.......... loss  : 0.0660\n",
+            "epoch : 29/50\t.......... loss  : 0.0639\n",
+            "epoch : 30/50\t.......... loss  : 0.0620\n",
+            "epoch : 31/50\t.......... loss  : 0.0602\n",
+            "epoch : 32/50\t.......... loss  : 0.0587\n",
+            "epoch : 33/50\t.......... loss  : 0.0573\n",
+            "epoch : 34/50\t.......... loss  : 0.0560\n",
+            "epoch : 35/50\t.......... loss  : 0.0549\n",
+            "epoch : 36/50\t.......... loss  : 0.0538\n",
+            "epoch : 37/50\t.......... loss  : 0.0528\n",
+            "epoch : 38/50\t.......... loss  : 0.0519\n",
+            "epoch : 39/50\t.......... loss  : 0.0511\n",
+            "epoch : 40/50\t.......... loss  : 0.0503\n",
+            "epoch : 41/50\t.......... loss  : 0.0496\n",
+            "epoch : 42/50\t.......... loss  : 0.0490\n",
+            "epoch : 43/50\t.......... loss  : 0.0483\n",
+            "epoch : 44/50\t.......... loss  : 0.0478\n",
+            "epoch : 45/50\t.......... loss  : 0.0472\n",
+            "epoch : 46/50\t.......... loss  : 0.0467\n",
+            "epoch : 47/50\t.......... loss  : 0.0462\n",
+            "epoch : 48/50\t.......... loss  : 0.0458\n",
+            "epoch : 49/50\t.......... loss  : 0.0453\n"
+          ]
+        }
+      ],
+      "source": [
+        "epochs = 50\n",
+        "x = input_features\n",
+        "y = target_ids.view(-1)\n",
+        "for epoch in range(epochs):\n",
+        "    optimizer.zero_grad()\n",
+        "    yhat = model(x)\n",
+        "    loss = criterion(yhat, y)\n",
+        "    loss.backward()\n",
+        "    nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
+        "    optimizer.step()\n",
+        "    print('epoch : {}/{}\\t..........'.format(epoch, epochs), end=' ')\n",
+        "    print('loss  : {:.4f}'.format(loss.item()))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "e8abf015",
+      "metadata": {
+        "id": "e8abf015"
+      },
+      "outputs": [],
+      "source": [
+        "model.inference() # set the model in inference mode"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "94233747",
+      "metadata": {
+        "id": "94233747"
+      },
+      "outputs": [],
+      "source": [
+        "def print_pred(prompt_text, next_token, score):\n",
+        "    print(f'predicted token: {prompt_text} + [{tokenizer[next_token]}] '\n",
+        "          f'| ~P(next|prompt) = {score.item():.4f}%')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "21d6402b",
+      "metadata": {
+        "id": "21d6402b",
+        "outputId": "f196bb44-4ce4-49ee-b8f1-4fa379156381"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "tensor([[[-1.7822, -2.0948,  0.0980,  5.4613,  5.4769,  0.9236, -2.9440, -0.2204,\n",
+              "          -0.5596, -1.2992, -1.4227, -1.4685, -1.4688, -3.1646, -2.0938, -2.5651,\n",
+              "          -3.2602,  0.2805, -1.9711],\n",
+              "         [-2.0338, -1.8651,  2.7062, -1.1563,  0.1298, -0.1714, -1.7824,  7.6752,\n",
+              "          -5.0695, -0.5060, -2.5180,  0.4877, -0.9850, -1.4442, -2.2192,  0.2705,\n",
+              "          -3.1254,  3.0572, -2.8077]]])"
+            ]
+          },
+          "execution_count": 9,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "prompt_text = \"he\"\n",
+        "prompt_tensor = tokenizer.encode(prompt_text, return_tensors=\"ego\").unsqueeze(0)\n",
+        "prompt_tensor = F.one_hot(prompt_tensor, tokenizer.vocab_size)\n",
+        "output = model(prompt_tensor)\n",
+        "output"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "9848127f",
+      "metadata": {
+        "id": "9848127f",
+        "outputId": "cbf214cf-30f0-49a4-cf33-941ae31c817e"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "predicted token: he + [a] | ~P(next|prompt) = 0.4955%\n",
+            "\n",
+            "\t_________________________________\n",
+            "\n",
+            "predicted token: he + [y] | ~P(next|prompt) = 0.9802%\n"
+          ]
+        }
+      ],
+      "source": [
+        "ith_layer_0 = 0  # first layer hidden states\n",
+        "ith_layer_1 = 1  # second layer hidden states\n",
+        "\n",
+        "score, next_token = output[:, ith_layer_0].softmax(-1).topk(1)\n",
+        "print_pred(prompt_text, next_token, score)\n",
+        "print('\\n\\t_________________________________\\n')\n",
+        "\n",
+        "score, next_token = output[:, ith_layer_1].softmax(-1).topk(1)\n",
+        "print_pred(prompt_text, next_token, score)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d6af77f4",
+      "metadata": {
+        "id": "d6af77f4"
+      },
+      "outputs": [],
+      "source": [
+        "def predict(tokens):\n",
+        "    if isinstance(tokens, str):\n",
+        "        tokens = tokenizer.tokenize(tokens)\n",
+        "    token_ids = [tokenizer[token] for token in tokens]\n",
+        "    features = F.one_hot(ego.tensor([token_ids]),\n",
+        "                         num_classes=tokenizer.vocab_size)\n",
+        "    with ego.no_grad():\n",
+        "        logits = model(features)\n",
+        "    score, pred = logits[:, -1, :].softmax(dim=-1).topk(1)\n",
+        "    return {'token_id': [int(pred)],\n",
+        "            'token': [tokenizer[pred]], 'score': float(score)}\n",
+        "\n",
+        "\n",
+        "def sample(prompt: str, length=5):\n",
+        "    tokens = [char for char in prompt]\n",
+        "    length = length - len(tokens)\n",
+        "    for _ in range(length):\n",
+        "        pred_token = predict(tokens)['token']\n",
+        "        tokens.append(pred_token[0])\n",
+        "    return \"\".join(tokens)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a87a532a",
+      "metadata": {
+        "id": "a87a532a",
+        "outputId": "77814eb2-f9ce-4690-dfb2-10281de8c3f5"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'token_id': [7], 'token': ['y'], 'score': 0.9802330732345581}"
+            ]
+          },
+          "execution_count": 13,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "predict(\"he\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "7efb2f84",
+      "metadata": {
+        "id": "7efb2f84",
+        "outputId": "d3952dad-db66-4d48-ee7f-9e7e0fea5c07"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "good i am fine \n"
+          ]
+        }
+      ],
+      "source": [
+        "print(sample(\"good\", tokenizer.maxlen))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "fe42803a",
+      "metadata": {
+        "id": "fe42803a",
+        "outputId": "6d01871f-cf42-4091-bb91-22fa98f69f03"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "hey how are you\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(sample(\"hey\", tokenizer.maxlen))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c50a0097",
+      "metadata": {
+        "id": "c50a0097",
+        "outputId": "dc2f169d-04de-4583-dd51-1a3ab70bc671"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "have a nice day\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(sample(\"have\", tokenizer.maxlen))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "cf844af1",
+      "metadata": {
+        "id": "cf844af1",
+        "outputId": "0695753b-c001-4d12-acae-f1496b411623"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "hey i have youu i am \n"
+          ]
+        }
+      ],
+      "source": [
+        "print(sample(\"hey i have\", tokenizer.maxlen + 6))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a403e0cf",
+      "metadata": {
+        "id": "a403e0cf",
+        "outputId": "98828421-7c30-4fb9-eb64-749de74f8fdf"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "good i am fine and i am fine a nice\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(sample(\"good i am fine and\", tokenizer.maxlen + 20))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "0fe22588",
+      "metadata": {
+        "scrolled": true,
+        "id": "0fe22588",
+        "outputId": "7d824784-1d9f-47c5-e8f6-2bcb7b10cbf0"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0156 ms ((3, 14, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0083 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.2861 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0082 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.3554 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0269 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0592 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0211 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0461 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0264 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0115 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0233 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0160 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0079 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.2772 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0084 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.1313 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0217 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0420 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0191 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0374 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0212 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0094 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0155 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0150 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0076 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.1590 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0075 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.1783 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0198 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0446 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0193 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0389 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0210 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0095 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0154 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0150 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0110 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.3797 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0077 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.3453 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0248 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0477 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0246 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0383 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0229 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0105 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0155 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0162 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0102 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.2908 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0079 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.2809 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0231 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0467 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0190 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0385 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0224 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0102 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0156 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0149 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0076 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.8516 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0079 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.2770 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0204 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0413 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0187 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0382 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0223 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0102 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0161 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0148 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0077 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.5585 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0107 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.2777 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0274 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0401 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0182 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0382 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0205 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0092 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0147 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0142 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0071 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.6203 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0074 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.1616 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0204 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0465 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0180 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0357 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0199 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0096 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0148 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0147 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0076 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 1.0029 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0072 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.1903 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0193 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0406 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0180 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0364 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0204 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0096 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0151 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0140 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0072 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.7133 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0072 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.3172 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0191 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0397 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0178 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0365 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0212 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0095 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0151 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0141 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0070 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.5525 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0071 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.3067 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0188 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0396 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0188 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0377 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0216 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0098 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0290 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0157 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0077 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.8759 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0073 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.3185 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0200 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0404 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0182 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0362 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0210 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0094 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0150 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0141 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0070 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.8999 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0118 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.4159 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0338 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0729 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0299 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0589 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0312 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0154 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0285 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0272 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0165 ms ((96, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.1744 ms ((96,), (3, 19), (19, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0078 ms ((96, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.1335 ms ((96,), (3, 32), (32, 96), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0255 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0483 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Add (n_inp[2], n_out[1]) 0.0190 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Sigmoid (n_inp[1], n_out[1]) 0.0351 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0243 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          Tanh (n_inp[1], n_out[1]) 0.0162 ms ((3, 32),)\u001b[0m\n",
+            "\u001b[1m\u001b[36m                           Sub (n_inp[2], n_out[1]) 0.0177 ms ((3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Addcmul (n_inp[4], n_out[1]) 0.0217 ms ((3, 32), (3, 32), (3, 32), ('float',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                        Concat (n_inp[16], n_out[1]) 1.2842 ms (('int',), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          View (n_inp[2], n_out[1]) 0.0153 ms ((42, 32), ('tuple',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                        Concat (n_inp[3], n_out[1]) 0.0191 ms (('int',), (14, 3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                        Concat (n_inp[3], n_out[1]) 0.0139 ms (('int',), (3, 32))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          View (n_inp[2], n_out[1]) 0.0089 ms ((3, 32), ('tuple',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0095 ms ((14, 3, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                          View (n_inp[2], n_out[1]) 0.0160 ms ((3, 14, 32), ('tuple',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                     Transpose (n_inp[3], n_out[1]) 0.0074 ms ((19, 32), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                         Addmm (n_inp[5], n_out[1]) 0.6025 ms ((19,), (42, 32), (32, 19), ('int',), ('int',))\u001b[0m\n",
+            "\u001b[1m\u001b[36m                       Reshape (n_inp[2], n_out[1]) 0.0088 ms ((42, 19), (3, 14, 19))\u001b[0m\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Forward graph computation for input with batch_size 3\n",
+        "with ego.enable_profile():\n",
+        "    logits = model(x)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "32d2c8ed",
+      "metadata": {
+        "id": "32d2c8ed",
+        "outputId": "eb458ff0-e73c-4c8a-db3a-343abe7c74fb"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "tensor([[-0.1074,  0.1042, -0.5305, -1.6488, -3.4726,  2.2977,  1.0623,  1.2069,\n",
+              "         -1.5110,  1.4202, -0.3740,  2.7155, -0.7314,  8.2732,  0.5616, -1.2247,\n",
+              "         -3.5442, -2.0093, -0.8996],\n",
+              "        [-2.3912, -1.8889,  9.2165, -0.9056, -1.6622, -0.2876, -3.7539,  1.7835,\n",
+              "         -4.3580, -0.9843,  0.1061,  0.8117, -3.0761, -1.5708, -2.2181, -0.0827,\n",
+              "         -1.1877, -1.8860,  1.5904],\n",
+              "        [-2.5384, -1.8931,  1.7497, -0.5150, -0.3946, -2.7083, -0.3823,  7.4448,\n",
+              "         -0.7722, -3.3754,  0.9520,  0.1721,  1.2855, -0.6555, -2.6928,  0.0379,\n",
+              "         -2.8825,  0.6843, -3.4892]])"
+            ]
+          },
+          "execution_count": 20,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "logits[:, -1, :]  # raw token logits (dim=1) per batch (dim=0)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "913d2ca8",
+      "metadata": {
+        "id": "913d2ca8",
+        "outputId": "249a2eac-1176-414e-d990-c78d8a10f986"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "tensor([[2.2683e-04, 2.8028e-04, 1.4857e-04, 4.8559e-05, 7.8380e-06, 2.5133e-03,\n",
+              "         7.3064e-04, 8.4427e-04, 5.5734e-05, 1.0450e-03, 1.7374e-04, 3.8168e-03,\n",
+              "         1.2154e-04, 9.8933e-01, 4.4284e-04, 7.4210e-05, 7.2967e-06, 3.3864e-05,\n",
+              "         1.0272e-04],\n",
+              "        [9.0792e-06, 1.5004e-05, 9.9822e-01, 4.0107e-05, 1.8822e-05, 7.4410e-05,\n",
+              "         2.3239e-06, 5.9035e-04, 1.2703e-06, 3.7073e-05, 1.1031e-04, 2.2338e-04,\n",
+              "         4.5771e-06, 2.0623e-05, 1.0795e-05, 9.1336e-05, 3.0250e-05, 1.5047e-05,\n",
+              "         4.8669e-04],\n",
+              "        [4.5644e-05, 8.7029e-05, 3.3244e-03, 3.4527e-04, 3.8945e-04, 3.8515e-05,\n",
+              "         3.9426e-04, 9.8868e-01, 2.6695e-04, 1.9764e-05, 1.4971e-03, 6.8638e-04,\n",
+              "         2.0898e-03, 2.9999e-04, 3.9113e-05, 6.0017e-04, 3.2358e-05, 1.1456e-03,\n",
+              "         1.7640e-05]])"
+            ]
+          },
+          "execution_count": 21,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "scores = logits[:, -1, :].softmax(-1) # logits to token probabilities\n",
+        "scores"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "12e93e07",
+      "metadata": {
+        "id": "12e93e07",
+        "outputId": "1444b9a8-5846-4ebf-b85a-aa1e1d761f3b"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "ego.return_types.topk(\n",
+              "values=tensor([[0.9893],\n",
+              "        [0.9982],\n",
+              "        [0.9887]]),\n",
+              "indices=tensor([[13],\n",
+              "        [ 2],\n",
+              "        [ 7]]))"
+            ]
+          },
+          "execution_count": 22,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "scores.topk(1)  # predicted token likelihood(%) and token_id(n)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "1ccf2904",
+      "metadata": {
+        "id": "1ccf2904",
+        "outputId": "4c7c8c1b-748e-4e6f-9320-4b28b9a248e9"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "['u', ' ', 'y']"
+            ]
+          },
+          "execution_count": 23,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "next_tokens = tokenizer.decode(scores.topk(1).indices.view(-1))\n",
+        "next_tokens  # next token per batch"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "bb5fa37c",
+      "metadata": {
+        "id": "bb5fa37c",
+        "outputId": "165456b3-9e1f-4607-8c7f-c47490f6e878"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "['hey how are yo', 'good i am fine', 'have a nice da']"
+            ]
+          },
+          "execution_count": 24,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "decoded_sentences = tokenizer.batch_decode(x.argmax(dim=-1), return_tokens=False)\n",
+        "decoded_sentences"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "61ccd821",
+      "metadata": {
+        "id": "61ccd821",
+        "outputId": "ad5d62e6-2dee-41c3-f44d-cd1e7d4e16cc"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "joined_sentence_next_token = 'hey how are you'\n",
+            "joined_sentence_next_token = 'good i am fine '\n",
+            "joined_sentence_next_token = 'have a nice day'\n"
+          ]
+        }
+      ],
+      "source": [
+        "# reconstructing the original sentences.\n",
+        "for sentence, next_token in zip(decoded_sentences, next_tokens):\n",
+        "    joined_sentence_next_token = sentence + next_token\n",
+        "    print(f\"{joined_sentence_next_token = }\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "7866e27c",
+      "metadata": {
+        "id": "7866e27c"
+      },
+      "outputs": [],
+      "source": [
+        ""
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.7"
+    },
+    "latex_envs": {
+      "LaTeX_envs_menu_present": true,
+      "autoclose": false,
+      "autocomplete": true,
+      "bibliofile": "biblio.bib",
+      "cite_by": "apalike",
+      "current_citInitial": 1,
+      "eqLabelWithNumbers": true,
+      "eqNumInitial": 1,
+      "hotkeys": {
+        "equation": "Ctrl-E",
+        "itemize": "Ctrl-I"
+      },
+      "labels_anchors": false,
+      "latex_user_defs": false,
+      "report_style_numbering": false,
+      "user_envs_cfg": false
+    },
+    "varInspector": {
+      "cols": {
+        "lenName": 16,
+        "lenType": 16,
+        "lenVar": 40
+      },
+      "kernels_config": {
+        "python": {
+          "delete_cmd_postfix": "",
+          "delete_cmd_prefix": "del ",
+          "library": "var_list.py",
+          "varRefreshCmd": "print(var_dic_list())"
+        },
+        "r": {
+          "delete_cmd_postfix": ") ",
+          "delete_cmd_prefix": "rm(",
+          "library": "var_list.r",
+          "varRefreshCmd": "cat(var_dic_list()) "
+        }
+      },
+      "types_to_exclude": [
+        "module",
+        "function",
+        "builtin_function_or_method",
+        "instance",
+        "_Feature"
+      ],
+      "window_display": false
+    },
+    "colab": {
+      "name": "Char2Char RNN with ego 🔥.ipynb",
+      "provenance": []
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}