Created
March 28, 2020 21:53
-
-
Save amoux/57f9e4bcb0e807300357ed96b3e42da4 to your computer and use it in GitHub Desktop.
Revisions
-
amoux revised this gist
Mar 28, 2020 . 1 changed file with 12 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -8,14 +8,25 @@ "collapsed_sections": [], "machine_shape": "hm", "mount_file_id": "1V0A6wSvlKTa5zayGpr_H_OOT5fSV4hyq", "authorship_tag": "ABX9TyM9718wavG3XZbmCQVjyL/b", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "<a href=\"https://colab.research.google.com/gist/amoux/57f9e4bcb0e807300357ed96b3e42da4/simple-node-graph-dawg-for-dna-sequences.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" ] }, { "cell_type": "markdown", "metadata": { -
amoux created this gist
Mar 28, 2020 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,674 @@ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Simple Node Graph (DAWG) for DNA Sequences.ipynb", "provenance": [], "collapsed_sections": [], "machine_shape": "hm", "mount_file_id": "1V0A6wSvlKTa5zayGpr_H_OOT5fSV4hyq", "authorship_tag": "ABX9TyM9718wavG3XZbmCQVjyL/b" }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "fgdJfi5G2YCf", "colab_type": "text" }, "source": [ "# Node Graph\n", "\n", "> Simple implementation of a ***Directed Acyclic Word Graph (`DAWG`)*** for dna sequence autocompletion.\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "AmbZDFFk6a3E", "colab_type": "code", "colab": {} }, "source": [ "from typing import List\n", "from pathlib import Path\n", "import json\n", "import collections" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "oDqJITx36g3e", "colab_type": "code", "colab": {} }, "source": [ "class Graph(object):\n", " def __init__(self):\n", " self.children = {}\n", " self.token = None\n", "\n", " def insert(self, token: str, add_token=True):\n", " node = self\n", " for char in token:\n", " if not char in node.children:\n", " node.children[char] = Graph()\n", " node = node.children[char]\n", " if add_token:\n", " node.token = token\n", " return node\n", "\n", " def transverse(self, query: str):\n", " node = self\n", " for char in query:\n", " child = node.children.get(char)\n", " if child:\n", " node = child\n", " else:\n", " break\n", " return node\n", "\n", " def descendant_nodes(self):\n", " deque = collections.deque()\n", " for char, child_node in self.children.items():\n", " deque.append((char, child_node))\n", "\n", " while deque:\n", " char, child_node = deque.popleft()\n", " if child_node.token:\n", " yield child_node\n", "\n", " for char, grand_child_node in child_node.children.items():\n", " deque.append((char, grand_child_node))\n", " \n", " def __repr__(self):\n", " return f'<Graph({self.token}, window={list(self.children.keys())})>'" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "LGaQEoA88MKq", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "0840b3e2-7055-4078-b9e4-387e40057030" }, "source": [ "covid_seqs_dir = Path('/content/drive/My Drive/covid-19/covid-sequences/data/')\n", "\n", "covid_seqs = {}\n", "for jsonfile in covid_seqs_dir.glob(\"*.json\"):\n", " name = jsonfile.name.replace(jsonfile.suffix, \"\")\n", " with jsonfile.open(\"r\") as file:\n", " covid_seqs[name] = json.load(file)\n", "\n", "covid_seqs.keys()" ], "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "dict_keys(['fluseq', 'sarseq', 'covseq'])" ] }, "metadata": { "tags": [] }, "execution_count": 3 } ] }, { "cell_type": "code", "metadata": { "id": "ctvIPfSQ-Anr", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 54 }, "outputId": "ed5fcba7-a98b-4e17-dc32-b3b5a33d4382" }, "source": [ "sequences_index = {}\n", "for key in covid_seqs.keys():\n", " for seqid, sequence in covid_seqs[key].items():\n", " if seqid not in sequences_index:\n", " sequences_index[seqid] = sequence\n", "\n", "sequences_index['AF389116']" ], "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'AGCGAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTTTTCTTAAAAGTGCCAGCACAAAATGCTATAAGCACAACTTTCCCTTATACTGGAGACCCTCCTTACAGCCATGGGACAGGAACAGGATACACCATGGATACTGTCAACAGGACACATCAGTACTCAGAAAAGGGAAGATGGACAACAAACACCGAAACTGGAGCACCGCAACTCAACCCGATTGATGGGCCACTGCCAGAAGACAATGAACCAAGTGGTTATGCCCAAACAGATTGTGTATTGGAAGCAATGGCTTTCCTTGAGGAATCCCATCCTGGTATTTTTGAAAACTCGTGTATTGAAACGATGGAGGTTGTTCAGCAAACACGAGTAGACAAGCTGACACAAGGCCGACAGACCTATGACTGGACTCTAAATAGAAACCAACCTGCTGCAACAGCATTGGCCAACACAATAGAAGTGTTCAGATCAAATGGCCTCACGGCCAATGAGTCTGGAAGGCTCATAGACTTCCTTAAGGATGTAATGGAGTCAATGAAAAAAGAAGAAATGGGGATCACAACTCATTTTCAGAGAAAGAGACGGGTGAGAGACAATATGACTAAGAAAATGATAACACAGAGAACAATAGGTAAAAAGAAGCAGAGATTGAACAAAAGGAGTTATCTAATTAGAGCATTGACCCTGAACACAATGACCAAAGATGCTGAGAGAGGGAAGCTAAAACGGAGAGCAATTGCAACCCCAGGGATGCAAATAAGGGGGTTTGTATACTTTGTTGAGACACTGGCAAGGAGTATATGTGAGAAACTTGAACAATCAGGGTTGCCAGTTGGAGGCAATGAGAAGAAAGCAAAGTTGGCAAATGTTGTAAGGAAGATGATGACCAATTCTCAGGACACCGAACTTTCTTTCACCATCACTGGAGATAACACCAAATGGAACGAAAATCAGAATCCTCGGATGTTTTTGGCCATGATCACATATATGACAAGAAATCAGCCCGAATGGTTCAGAAATGTTCTAAGTATTGCTCCAATAATGTTCTCAAACAAAATGGCGAGACTGGGAAAAGGGTATATGTTTGAGAGCAAGAGTATGAAACTTAGAACTCAAATACCTGCAGAAATGCTAGCAAGCATCGATTTGAAATATTTCAATGATTCAACAAGAAAGAAGATTGAAAAAATCCGACCGCTCTTAATAGAGGGGACTGCATCATTGAGCCCTGGAATGATGATGGGCATGTTCAATATGTTAAGCACTGTATTAGGCGTCTCCATCCTGAATCTTGGACAAAAGAGATACACCAAGACTACTTACTGGTGGGATGGTCTTCAATCCTCTGACGATTTTGCTCTGATTGTGAATGCACCCAATCATGAAGGGATTCAAGCCGGAGTCGACAGGTTTTATCGAACCTGTAAGCTACTTGGAATCAATATGAGCAAGAAAAAGTCTTACATAAACAGAACAGGTACATTTGAATTCACAAGTTTTTTCTATCGTTATGGGTTTGTTGCCAATTTCAGCATGGAGCTCCCCAGTTTTGGGGTGTCTGGGATCAACGAGTCAGCGGACATGAGTATTGGAGTTACTGTCATCAAAAACAATATGATAAACAATGATCTTGGTCCAGCAACAGCTCAAATGGCCCTTCAGTTGTTCATCAAAGATTACAGGTACACGTACCGATGCCATAGAGGTGACACACAAATACAAACCCGAAGATCATTTGAAATAAAGAAACTGTGGGAGCAAACCCGTTCCAAAGCTGGACTGCTGGTCTCCGACGGAGGCCCAAATTTATACAACATTAGAAATCTCCACATTCCTGAAGTCTGCCTAAAATGGGAATTGATGGATGAGGATTACCAGGGGCGTTTATGCAACCCACTGAACCCATTTGTCAGCCATAAAGAAATTGAATCAATGAACAATGCAGTGATGATGCCAGCACATGGTCCAGCCAAAAACATGGAGTATGATGCTGTTGCAACAACACACTCCTGGATCCCCAAAAGAAATCGATCCATCTTGAATACAAGTCAAAGAGGAGTACTTGAAGATGAACAAATGTACCAAAGGTGCTGCAATTTATTTGAAAAATTCTTCCCCAGCAGTTCATACAGAAGACCAGTCGGGATATCCAGTATGGTGGAGGCTATGGTTTCCAGAGCCCGAATTGATGCACGGATTGATTTCGAATCTGGAAGGATAAAGAAAGAAGAGTTCACTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCTCAGACGGCAAAAATAGTGAATTTAGCTTGTCCTTCATGAAAAAATGCCTTGTTTCTACT'" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "code", "metadata": { "id": "rxAxFDcfDIy-", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 194 }, "outputId": "92c269e6-9d75-41cc-d07e-5eab98009ee7" }, "source": [ "vocab = {}\n", "for sequence in sequences_index.values():\n", " for char in sequence:\n", " if char not in vocab:\n", " vocab[char] = 1\n", " else:\n", " vocab[char] += 1\n", "\n", "vocab_counts = collections.Counter(vocab)\n", "vocab_counts.most_common(10)" ], "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[('T', 35674146),\n", " ('A', 31873941),\n", " ('G', 26194090),\n", " ('C', 22849345),\n", " ('N', 9662),\n", " ('Y', 1547),\n", " ('R', 1083),\n", " ('K', 333),\n", " ('W', 286),\n", " ('M', 283)]" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "code", "metadata": { "id": "nCB0KmDdFUkj", "colab_type": "code", "colab": {} }, "source": [ "vocab_index = {}\n", "index_vocab = {}\n", "for index, (key, count) in enumerate(vocab_counts.most_common()):\n", " vocab_index[key] = index\n", " index_vocab[index] = key\n", "\n", "\n", "def encode(sequence: str) -> List[int]:\n", " return [vocab_index[char] for char in sequence if char in vocab_index]\n", "\n", "def decode(sequence_ids) -> List[str]:\n", " return [index_vocab[idx] for idx in sequence_ids if idx in index_vocab]" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "AXmQR_ghGc4J", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 52 }, "outputId": "2207577e-3dbe-4fb2-ac4e-5e877beddfb6" }, "source": [ "sequence_ids = encode(sequences_index[\"AF389116\"])\n", "sequence = decode(sequence_ids)\n", "\n", "print(f\"sequence-ids: {sequence_ids[:20]}\")\n", "print(f\"sequence: {sequence[:20]}\")" ], "execution_count": 7, "outputs": [ { "output_type": "stream", "text": [ "sequence-ids: [1, 2, 3, 2, 1, 1, 1, 2, 3, 1, 2, 2, 3, 1, 1, 1, 3, 3, 1, 0]\n", "sequence: ['A', 'G', 'C', 'G', 'A', 'A', 'A', 'G', 'C', 'A', 'G', 'G', 'C', 'A', 'A', 'A', 'C', 'C', 'A', 'T']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "oboeSDdCHTLx", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "cd2ab426-a129-474a-fff7-fb2896059bae" }, "source": [ "print(encode('ATGGAAGTATTTAAAGCGCCACCTATTGGGATATAAG'))" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "[1, 0, 2, 2, 1, 1, 2, 0, 1, 0, 0, 0, 1, 1, 1, 2, 3, 2, 3, 3, 1, 3, 3, 0, 1, 0, 0, 2, 2, 2, 1, 0, 1, 0, 1, 1, 2]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "_PUz0utHIbIb", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "939c352c-a188-4be5-8874-93664596c87a" }, "source": [ "sequences = []\n", "for sequence in sequences_index.values():\n", " sequence_ids = encode(sequence)\n", " sequences.append(sequence_ids)\n", "len(sequences)" ], "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "15177" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "zdBpmf9noHvt", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 194 }, "outputId": "05796ef4-3049-428a-dcc3-ea29310a79dd" }, "source": [ "import random\n", "\n", "num_steps = 3 # ngram size\n", "maxsize = 4000 # maxsize up to 15,177\n", "\n", "dna_pairs = {}\n", "random.shuffle(sequences)\n", "for sequence in sequences[:maxsize]:\n", " for i in range(0, len(sequence), num_steps):\n", " seq_ids = sequence[i: i + num_steps]\n", " seq = \"\".join(decode(seq_ids))\n", " if seq not in dna_pairs:\n", " dna_pairs[seq] = 1\n", " else:\n", " dna_pairs[seq] += 1\n", "\n", "common_pairs = collections.Counter(dna_pairs)\n", "common_pairs.most_common(10)" ], "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[('TTT', 302667),\n", " ('TTG', 274921),\n", " ('TGT', 267892),\n", " ('ATG', 254791),\n", " ('ATT', 233247),\n", " ('GTT', 232533),\n", " ('AAA', 231146),\n", " ('TTA', 219465),\n", " ('AAT', 217561),\n", " ('TGG', 212646)]" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "code", "metadata": { "id": "3Erk5NR-Fr_5", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "d55047e4-0480-4aac-aa1a-2d0571460ac3" }, "source": [ "len(common_pairs.keys())" ], "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "426" ] }, "metadata": { "tags": [] }, "execution_count": 11 } ] }, { "cell_type": "markdown", "metadata": { "id": "K3F8sUKM5McD", "colab_type": "text" }, "source": [ "## Autocomplete Demo\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "LFwBohU5vz9w", "colab_type": "code", "colab": {} }, "source": [ "def iter_dna_pairs(maxsize=5000, window=3):\n", " for sequence in sequences[:maxsize]:\n", " for i in range(0, len(sequence), window):\n", " sequence_ids = sequence[i: i + window]\n", " sequence = \"\".join(decode(sequence_ids))\n", " yield sequence" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "2jct4paAxqx2", "colab_type": "code", "colab": {} }, "source": [ "graph = Graph()\n", "for pairs in iter_dna_pairs(window=3):\n", " graph.insert(pairs)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "MhFGyejwx5bz", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "f02db152-0c1d-4b11-b4df-96df874d243d" }, "source": [ "graph.transverse(query=\"AT\")" ], "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "<Graph(None, window=['G', 'T', 'C', 'A'])>" ] }, "metadata": { "tags": [] }, "execution_count": 14 } ] }, { "cell_type": "code", "metadata": { "id": "C7qmtXa6x6kY", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 87 }, "outputId": "fd0f4523-01c2-4014-9e74-b2c0677374f7" }, "source": [ "dna_node = graph.transverse(\"AT\")\n", "list(dna_node.descendant_nodes())" ], "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[<Graph(ATG, window=[])>,\n", " <Graph(ATT, window=[])>,\n", " <Graph(ATC, window=[])>,\n", " <Graph(ATA, window=[])>]" ] }, "metadata": { "tags": [] }, "execution_count": 15 } ] }, { "cell_type": "code", "metadata": { "id": "SrlUYT2703g4", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 87 }, "outputId": "99364c47-c131-415d-91aa-50db33cbf7ee" }, "source": [ "dna_node = graph.transverse(\"AT\")\n", "list(dna_node.descendant_nodes())" ], "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[<Graph(ATG, window=[])>,\n", " <Graph(ATT, window=[])>,\n", " <Graph(ATC, window=[])>,\n", " <Graph(ATA, window=[])>]" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "markdown", "metadata": { "id": "skiMO7YP32c0", "colab_type": "text" }, "source": [ "> Here the size of the window is increased by `6` steps." ] }, { "cell_type": "code", "metadata": { "id": "ciljMMpx1jog", "colab_type": "code", "colab": {} }, "source": [ "graph = Graph()\n", "for pairs in iter_dna_pairs(maxsize=6000, window=6):\n", " graph.insert(pairs)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "fiUfHL1F3sKC", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "5a1bf1a5-0a17-4a90-8a99-f4a40cd68711" }, "source": [ "graph.transverse(\"ATGG\")" ], "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "<Graph(None, window=['A', 'C', 'G', 'T'])>" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "code", "metadata": { "id": "Qo4Z70CP3vBZ", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 230 }, "outputId": "9ccdef41-5b76-4f19-d80c-0b4db20d753c" }, "source": [ "dna_node = graph.transverse(\"ATGG\")\n", "list(dna_node.descendant_nodes())" ], "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[<Graph(ATGGAC, window=[])>,\n", " <Graph(ATGGAT, window=[])>,\n", " <Graph(ATGGAG, window=[])>,\n", " <Graph(ATGGAA, window=[])>,\n", " <Graph(ATGGCT, window=[])>,\n", " <Graph(ATGGCG, window=[])>,\n", " <Graph(ATGGCA, window=[])>,\n", " <Graph(ATGGCC, window=[])>,\n", " <Graph(ATGGGA, window=[])>,\n", " <Graph(ATGGGG, window=[])>,\n", " <Graph(ATGGGC, window=[])>,\n", " <Graph(ATGGTA, window=[])>]" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "code", "metadata": { "id": "ly3mK9OCBYHG", "colab_type": "code", "colab": {} }, "source": [ "" ], "execution_count": 0, "outputs": [] } ] }