Skip to content

Instantly share code, notes, and snippets.

@hideaki-t
Last active July 30, 2016 14:42
Show Gist options
  • Save hideaki-t/81a94ef1e0895a97e7cc4fdf9250141a to your computer and use it in GitHub Desktop.
Save hideaki-t/81a94ef1e0895a97e7cc4fdf9250141a to your computer and use it in GitHub Desktop.

Revisions

  1. hideaki-t revised this gist Jul 30, 2016. 1 changed file with 25 additions and 25 deletions.
    50 changes: 25 additions & 25 deletions sqlite_spellfix1_python.ipynb
    Original file line number Diff line number Diff line change
    @@ -80,7 +80,7 @@
    {
    "data": {
    "text/plain": [
    "<sqlite3.Cursor at 0x7f19515181f0>"
    "<sqlite3.Cursor at 0x7f30840d01f0>"
    ]
    },
    "execution_count": 5,
    @@ -229,7 +229,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": 12,
    "execution_count": 15,
    "metadata": {
    "collapsed": false
    },
    @@ -240,38 +240,38 @@
    "[]"
    ]
    },
    "execution_count": 12,
    "execution_count": 15,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute('UPDATE demo_vocab SET k1 = spellfix1_translit(word), k2=spellfix1_phonehash(spellfix1_translit(word))').fetchall()"
    "con.execute('UPDATE demo_vocab SET k1 = lower(spellfix1_translit(word)), k2=spellfix1_phonehash(lower(spellfix1_translit(word)))').fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 13,
    "execution_count": 16,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[(1, 1, 0, 'あいうえお', 'AIUEO', 'A'),\n",
    " (2, 1, 0, 'あえいおう', 'AEIOU', 'A'),\n",
    " (3, 1, 0, 'かきくけこ', 'KAKIKUKEKO', 'CACACACACA'),\n",
    " (4, 1, 0, 'AIUEO', 'AIUEO', 'A'),\n",
    " (5, 1, 0, 'KAKIKUKEKO', 'KAKIKUKEKO', 'CACACACACA'),\n",
    "[(1, 1, 0, 'あいうえお', 'aiueo', 'A'),\n",
    " (2, 1, 0, 'あえいおう', 'aeiou', 'A'),\n",
    " (3, 1, 0, 'かきくけこ', 'kakikukeko', 'CACACACACA'),\n",
    " (4, 1, 0, 'AIUEO', 'aiueo', 'A'),\n",
    " (5, 1, 0, 'KAKIKUKEKO', 'kakikukeko', 'CACACACACA'),\n",
    " (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n",
    " (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n",
    " (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n",
    " (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n",
    " (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]"
    ]
    },
    "execution_count": 13,
    "execution_count": 16,
    "metadata": {},
    "output_type": "execute_result"
    }
    @@ -282,7 +282,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": 14,
    "execution_count": 17,
    "metadata": {
    "collapsed": false
    },
    @@ -302,7 +302,7 @@
    " ('KAKIKUKEKO', 1, 300, 0, 331, 10)]"
    ]
    },
    "execution_count": 14,
    "execution_count": 17,
    "metadata": {},
    "output_type": "execute_result"
    }
    @@ -313,7 +313,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": 15,
    "execution_count": 18,
    "metadata": {
    "collapsed": false
    },
    @@ -333,7 +333,7 @@
    " ('KAKIKUKEKO', 1, 300, 0, 331, 10)]"
    ]
    },
    "execution_count": 15,
    "execution_count": 18,
    "metadata": {},
    "output_type": "execute_result"
    }
    @@ -344,7 +344,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": 16,
    "execution_count": 19,
    "metadata": {
    "collapsed": false
    },
    @@ -357,7 +357,7 @@
    " ('AIUEO', 1, 12, 0, 43, 5)]"
    ]
    },
    "execution_count": 16,
    "execution_count": 19,
    "metadata": {},
    "output_type": "execute_result"
    }
    @@ -368,20 +368,20 @@
    },
    {
    "cell_type": "code",
    "execution_count": 17,
    "execution_count": 20,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[('あいうえお', 1, 52, 0, 83, 5),\n",
    " ('あえいおう', 1, 52, 0, 83, 5),\n",
    "[('あえいおう', 1, 51, 0, 82, 5),\n",
    " ('あいうえお', 1, 52, 0, 83, 5),\n",
    " ('AIUEO', 1, 52, 0, 83, 5)]"
    ]
    },
    "execution_count": 17,
    "execution_count": 20,
    "metadata": {},
    "output_type": "execute_result"
    }
    @@ -392,7 +392,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": 18,
    "execution_count": 21,
    "metadata": {
    "collapsed": false
    },
    @@ -409,7 +409,7 @@
    " ('KAKIKUKEKO', 1, 240, 0, 271, 10)]"
    ]
    },
    "execution_count": 18,
    "execution_count": 21,
    "metadata": {},
    "output_type": "execute_result"
    }
    @@ -420,7 +420,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": 19,
    "execution_count": 22,
    "metadata": {
    "collapsed": false
    },
    @@ -437,7 +437,7 @@
    " ('KAKIKUKEKO', 1, 200, 0, 231, 10)]"
    ]
    },
    "execution_count": 19,
    "execution_count": 22,
    "metadata": {},
    "output_type": "execute_result"
    }
  2. hideaki-t created this gist Jul 30, 2016.
    480 changes: 480 additions & 0 deletions sqlite_spellfix1_python.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,480 @@
    {
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {
    "collapsed": true
    },
    "outputs": [],
    "source": [
    "import sqlite3"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {
    "collapsed": true
    },
    "outputs": [],
    "source": [
    "con = sqlite3.connect(':memory:')"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[(None,)]"
    ]
    },
    "execution_count": 3,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.enable_load_extension(True)\n",
    "con.execute('select load_extension(\"spellfix.so\")').fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[]"
    ]
    },
    "execution_count": 4,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.executescript('''\n",
    "CREATE VIRTUAL TABLE demo USING spellfix1;\n",
    "CREATE VIRTUAL TABLE words USING fts4(word);\n",
    "''').fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "<sqlite3.Cursor at 0x7f19515181f0>"
    ]
    },
    "execution_count": 5,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.executemany(\"INSERT INTO words VALUES(?)\",\n",
    " [['あいうえお'], ['あえいおう'], ['かきくけこ'],\n",
    " ['AIUEO'], ['KAKIKUKEKO'],\n",
    " ['kennesaw'], ['kenosha'], ['kenesaw'], ['kenaga'], ['keanak']])"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[]"
    ]
    },
    "execution_count": 6,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute('INSERT INTO demo(word) SELECT word FROM words').fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[('あいうえお', 1, 4, 0, 35, 5),\n",
    " ('あえいおう', 1, 4, 0, 35, 5),\n",
    " ('かきくけこ', 1, 4, 0, 35, 5),\n",
    " ('AIUEO', 1, 107, 0, 138, 5),\n",
    " ('keanak', 1, 198, 0, 229, 6),\n",
    " ('kenosha', 1, 220, 0, 251, 7),\n",
    " ('kenaga', 1, 220, 0, 251, 6),\n",
    " ('kenesaw', 1, 245, 0, 276, 7),\n",
    " ('kennesaw', 1, 247, 0, 278, 8),\n",
    " ('KAKIKUKEKO', 1, 320, 0, 351, 10)]"
    ]
    },
    "execution_count": 7,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute(\"SELECT * FROM demo WHERE word MATCH 'あいう'\").fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[(1, 1, 0, 'あいうえお', '?????', ''),\n",
    " (2, 1, 0, 'あえいおう', '?????', ''),\n",
    " (3, 1, 0, 'かきくけこ', '?????', ''),\n",
    " (4, 1, 0, 'AIUEO', 'aiueo', 'A'),\n",
    " (5, 1, 0, 'KAKIKUKEKO', 'kakikukeko', 'CACACACACA'),\n",
    " (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n",
    " (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n",
    " (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n",
    " (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n",
    " (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]"
    ]
    },
    "execution_count": 8,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute('SELECT * FROM demo_vocab').fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 9,
    "metadata": {
    "collapsed": false
    },
    "outputs": [],
    "source": [
    "import igo\n",
    "import csv\n",
    "from io import StringIO\n",
    "import unicodedata\n",
    "tagger = igo.tagger.Tagger()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 10,
    "metadata": {
    "collapsed": false
    },
    "outputs": [],
    "source": [
    "def maketrans(s):\n",
    " # KATAKANA LETTER [A]\n",
    " return str.maketrans({c:unicodedata.name(c).split()[2] for c in s})\n",
    "\n",
    "trans = maketrans('アイウエオカキクケコ')\n",
    "def my_spellfix1_translit(w):\n",
    " lines = StringIO()\n",
    " for m in tagger.parse(w):\n",
    " print(\"{},{}\".format(m.surface, m.feature), file=lines)\n",
    " lines.seek(0)\n",
    " reading = ''.join(x[8] if len(x) > 9 else x[0] for x in csv.reader(lines) if x)\n",
    " return reading.translate(trans)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 11,
    "metadata": {
    "collapsed": false
    },
    "outputs": [],
    "source": [
    "con.create_function('spellfix1_translit', 1, my_spellfix1_translit)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 12,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[]"
    ]
    },
    "execution_count": 12,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute('UPDATE demo_vocab SET k1 = spellfix1_translit(word), k2=spellfix1_phonehash(spellfix1_translit(word))').fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 13,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[(1, 1, 0, 'あいうえお', 'AIUEO', 'A'),\n",
    " (2, 1, 0, 'あえいおう', 'AEIOU', 'A'),\n",
    " (3, 1, 0, 'かきくけこ', 'KAKIKUKEKO', 'CACACACACA'),\n",
    " (4, 1, 0, 'AIUEO', 'AIUEO', 'A'),\n",
    " (5, 1, 0, 'KAKIKUKEKO', 'KAKIKUKEKO', 'CACACACACA'),\n",
    " (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n",
    " (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n",
    " (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n",
    " (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n",
    " (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]"
    ]
    },
    "execution_count": 13,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute('SELECT * FROM demo_vocab').fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 14,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[('あいうえお', 1, 87, 0, 118, 5),\n",
    " ('あえいおう', 1, 87, 0, 118, 5),\n",
    " ('AIUEO', 1, 87, 0, 118, 5),\n",
    " ('keanak', 1, 178, 0, 209, 6),\n",
    " ('kenosha', 1, 200, 0, 231, 7),\n",
    " ('kenaga', 1, 200, 0, 231, 6),\n",
    " ('kenesaw', 1, 225, 0, 256, 7),\n",
    " ('kennesaw', 1, 227, 0, 258, 8),\n",
    " ('かきくけこ', 1, 300, 0, 331, 5),\n",
    " ('KAKIKUKEKO', 1, 300, 0, 331, 10)]"
    ]
    },
    "execution_count": 14,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute(\"SELECT * FROM demo WHERE word MATCH 'ア'\").fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 15,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[('あいうえお', 1, 87, 0, 118, 5),\n",
    " ('あえいおう', 1, 87, 0, 118, 5),\n",
    " ('AIUEO', 1, 87, 0, 118, 5),\n",
    " ('keanak', 1, 178, 0, 209, 6),\n",
    " ('kenosha', 1, 200, 0, 231, 7),\n",
    " ('kenaga', 1, 200, 0, 231, 6),\n",
    " ('kenesaw', 1, 225, 0, 256, 7),\n",
    " ('kennesaw', 1, 227, 0, 258, 8),\n",
    " ('かきくけこ', 1, 300, 0, 331, 5),\n",
    " ('KAKIKUKEKO', 1, 300, 0, 331, 10)]"
    ]
    },
    "execution_count": 15,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute(\"SELECT * FROM demo WHERE word MATCH 'あ'\").fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 16,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[('あいうえお', 1, 12, 0, 43, 5),\n",
    " ('あえいおう', 1, 12, 0, 43, 5),\n",
    " ('AIUEO', 1, 12, 0, 43, 5)]"
    ]
    },
    "execution_count": 16,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute(\"SELECT * FROM demo WHERE word MATCH 'A'\").fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[('あいうえお', 1, 52, 0, 83, 5),\n",
    " ('あえいおう', 1, 52, 0, 83, 5),\n",
    " ('AIUEO', 1, 52, 0, 83, 5)]"
    ]
    },
    "execution_count": 17,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute(\"SELECT * FROM demo WHERE word MATCH 'e'\").fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 18,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[('keanak', 1, 158, 0, 189, 6),\n",
    " ('kenosha', 1, 180, 0, 211, 7),\n",
    " ('kenaga', 1, 180, 0, 211, 6),\n",
    " ('kenesaw', 1, 205, 0, 236, 7),\n",
    " ('kennesaw', 1, 207, 0, 238, 8),\n",
    " ('かきくけこ', 1, 240, 0, 271, 5),\n",
    " ('KAKIKUKEKO', 1, 240, 0, 271, 10)]"
    ]
    },
    "execution_count": 18,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute(\"SELECT * FROM demo WHERE word MATCH 'ca'\").fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 19,
    "metadata": {
    "collapsed": false
    },
    "outputs": [
    {
    "data": {
    "text/plain": [
    "[('keanak', 1, 117, 0, 148, 6),\n",
    " ('kenosha', 1, 140, 0, 171, 7),\n",
    " ('kenaga', 1, 140, 0, 171, 6),\n",
    " ('kenesaw', 1, 165, 0, 196, 7),\n",
    " ('kennesaw', 1, 167, 0, 198, 8),\n",
    " ('かきくけこ', 1, 200, 0, 231, 5),\n",
    " ('KAKIKUKEKO', 1, 200, 0, 231, 10)]"
    ]
    },
    "execution_count": 19,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "con.execute(\"SELECT * FROM demo WHERE word MATCH 'ka'\").fetchall()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
    "collapsed": true
    },
    "outputs": [],
    "source": []
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.5.2"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 0
    }