Last active
July 30, 2016 14:42
-
-
Save hideaki-t/81a94ef1e0895a97e7cc4fdf9250141a to your computer and use it in GitHub Desktop.
Revisions
-
hideaki-t revised this gist
Jul 30, 2016. 1 changed file with 25 additions and 25 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -80,7 +80,7 @@ { "data": { "text/plain": [ "<sqlite3.Cursor at 0x7f30840d01f0>" ] }, "execution_count": 5, @@ -229,7 +229,7 @@ }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, @@ -240,38 +240,38 @@ "[]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute('UPDATE demo_vocab SET k1 = lower(spellfix1_translit(word)), k2=spellfix1_phonehash(lower(spellfix1_translit(word)))').fetchall()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 1, 0, 'あいうえお', 'aiueo', 'A'),\n", " (2, 1, 0, 'あえいおう', 'aeiou', 'A'),\n", " (3, 1, 0, 'かきくけこ', 'kakikukeko', 'CACACACACA'),\n", " (4, 1, 0, 'AIUEO', 'aiueo', 'A'),\n", " (5, 1, 0, 'KAKIKUKEKO', 'kakikukeko', 'CACACACACA'),\n", " (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n", " (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n", " (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n", " (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n", " (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -282,7 +282,7 @@ }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, @@ -302,7 +302,7 @@ " ('KAKIKUKEKO', 1, 300, 0, 331, 10)]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -313,7 +313,7 @@ }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, @@ -333,7 +333,7 @@ " ('KAKIKUKEKO', 1, 300, 0, 331, 10)]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -344,7 +344,7 @@ }, { "cell_type": 
"code", "execution_count": 19, "metadata": { "collapsed": false }, @@ -357,7 +357,7 @@ " ('AIUEO', 1, 12, 0, 43, 5)]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -368,20 +368,20 @@ }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('あえいおう', 1, 51, 0, 82, 5),\n", " ('あいうえお', 1, 52, 0, 83, 5),\n", " ('AIUEO', 1, 52, 0, 83, 5)]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -392,7 +392,7 @@ }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, @@ -409,7 +409,7 @@ " ('KAKIKUKEKO', 1, 240, 0, 271, 10)]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -420,7 +420,7 @@ }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, @@ -437,7 +437,7 @@ " ('KAKIKUKEKO', 1, 200, 0, 231, 10)]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } -
hideaki-t created this gist
Jul 30, 2016. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,480 @@ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sqlite3" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "con = sqlite3.connect(':memory:')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(None,)]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.enable_load_extension(True)\n", "con.execute('select load_extension(\"spellfix.so\")').fetchall()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.executescript('''\n", "CREATE VIRTUAL TABLE demo USING spellfix1;\n", "CREATE VIRTUAL TABLE words USING fts4(word);\n", "''').fetchall()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<sqlite3.Cursor at 0x7f19515181f0>" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.executemany(\"INSERT INTO words VALUES(?)\",\n", " [['あいうえお'], ['あえいおう'], ['かきくけこ'],\n", " ['AIUEO'], ['KAKIKUKEKO'],\n", " ['kennesaw'], ['kenosha'], ['kenesaw'], ['kenaga'], ['keanak']])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"con.execute('INSERT INTO demo(word) SELECT word FROM words').fetchall()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('あいうえお', 1, 4, 0, 35, 5),\n", " ('あえいおう', 1, 4, 0, 35, 5),\n", " ('かきくけこ', 1, 4, 0, 35, 5),\n", " ('AIUEO', 1, 107, 0, 138, 5),\n", " ('keanak', 1, 198, 0, 229, 6),\n", " ('kenosha', 1, 220, 0, 251, 7),\n", " ('kenaga', 1, 220, 0, 251, 6),\n", " ('kenesaw', 1, 245, 0, 276, 7),\n", " ('kennesaw', 1, 247, 0, 278, 8),\n", " ('KAKIKUKEKO', 1, 320, 0, 351, 10)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"SELECT * FROM demo WHERE word MATCH 'あいう'\").fetchall()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 1, 0, 'あいうえお', '?????', ''),\n", " (2, 1, 0, 'あえいおう', '?????', ''),\n", " (3, 1, 0, 'かきくけこ', '?????', ''),\n", " (4, 1, 0, 'AIUEO', 'aiueo', 'A'),\n", " (5, 1, 0, 'KAKIKUKEKO', 'kakikukeko', 'CACACACACA'),\n", " (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n", " (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n", " (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n", " (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n", " (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute('SELECT * FROM demo_vocab').fetchall()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import igo\n", "import csv\n", "from io import StringIO\n", "import unicodedata\n", "tagger = igo.tagger.Tagger()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def maketrans(s):\n", " # KATAKANA LETTER [A]\n", " return str.maketrans({c:unicodedata.name(c).split()[2] for c in s})\n", "\n", "trans = maketrans('アイウエオカキクケコ')\n", "def 
my_spellfix1_translit(w):\n", " lines = StringIO()\n", " for m in tagger.parse(w):\n", " print(\"{},{}\".format(m.surface, m.feature), file=lines)\n", " lines.seek(0)\n", " reading = ''.join(x[8] if len(x) > 9 else x[0] for x in csv.reader(lines) if x)\n", " return reading.translate(trans)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "con.create_function('spellfix1_translit', 1, my_spellfix1_translit)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute('UPDATE demo_vocab SET k1 = spellfix1_translit(word), k2=spellfix1_phonehash(spellfix1_translit(word))').fetchall()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 1, 0, 'あいうえお', 'AIUEO', 'A'),\n", " (2, 1, 0, 'あえいおう', 'AEIOU', 'A'),\n", " (3, 1, 0, 'かきくけこ', 'KAKIKUKEKO', 'CACACACACA'),\n", " (4, 1, 0, 'AIUEO', 'AIUEO', 'A'),\n", " (5, 1, 0, 'KAKIKUKEKO', 'KAKIKUKEKO', 'CACACACACA'),\n", " (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n", " (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n", " (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n", " (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n", " (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute('SELECT * FROM demo_vocab').fetchall()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('あいうえお', 1, 87, 0, 118, 5),\n", " ('あえいおう', 1, 87, 0, 118, 5),\n", " ('AIUEO', 1, 87, 0, 118, 5),\n", " ('keanak', 1, 178, 0, 209, 6),\n", " ('kenosha', 1, 200, 0, 231, 7),\n", " ('kenaga', 1, 200, 0, 231, 6),\n", " ('kenesaw', 1, 225, 0, 256, 7),\n", " ('kennesaw', 1, 227, 0, 258, 
8),\n", " ('かきくけこ', 1, 300, 0, 331, 5),\n", " ('KAKIKUKEKO', 1, 300, 0, 331, 10)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"SELECT * FROM demo WHERE word MATCH 'ア'\").fetchall()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('あいうえお', 1, 87, 0, 118, 5),\n", " ('あえいおう', 1, 87, 0, 118, 5),\n", " ('AIUEO', 1, 87, 0, 118, 5),\n", " ('keanak', 1, 178, 0, 209, 6),\n", " ('kenosha', 1, 200, 0, 231, 7),\n", " ('kenaga', 1, 200, 0, 231, 6),\n", " ('kenesaw', 1, 225, 0, 256, 7),\n", " ('kennesaw', 1, 227, 0, 258, 8),\n", " ('かきくけこ', 1, 300, 0, 331, 5),\n", " ('KAKIKUKEKO', 1, 300, 0, 331, 10)]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"SELECT * FROM demo WHERE word MATCH 'あ'\").fetchall()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('あいうえお', 1, 12, 0, 43, 5),\n", " ('あえいおう', 1, 12, 0, 43, 5),\n", " ('AIUEO', 1, 12, 0, 43, 5)]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"SELECT * FROM demo WHERE word MATCH 'A'\").fetchall()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('あいうえお', 1, 52, 0, 83, 5),\n", " ('あえいおう', 1, 52, 0, 83, 5),\n", " ('AIUEO', 1, 52, 0, 83, 5)]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"SELECT * FROM demo WHERE word MATCH 'e'\").fetchall()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('keanak', 1, 158, 0, 189, 6),\n", " ('kenosha', 1, 180, 0, 211, 7),\n", " ('kenaga', 1, 180, 0, 211, 6),\n", " ('kenesaw', 1, 205, 0, 236, 7),\n", " ('kennesaw', 1, 207, 0, 238, 
8),\n", " ('かきくけこ', 1, 240, 0, 271, 5),\n", " ('KAKIKUKEKO', 1, 240, 0, 271, 10)]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"SELECT * FROM demo WHERE word MATCH 'ca'\").fetchall()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('keanak', 1, 117, 0, 148, 6),\n", " ('kenosha', 1, 140, 0, 171, 7),\n", " ('kenaga', 1, 140, 0, 171, 6),\n", " ('kenesaw', 1, 165, 0, 196, 7),\n", " ('kennesaw', 1, 167, 0, 198, 8),\n", " ('かきくけこ', 1, 200, 0, 231, 5),\n", " ('KAKIKUKEKO', 1, 200, 0, 231, 10)]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"SELECT * FROM demo WHERE word MATCH 'ka'\").fetchall()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }