Created
October 14, 2014 21:39
-
-
Save xavivars/38ecea31809d72081a81 to your computer and use it in GitHub Desktop.
Revisions
-
xavivars created this gist
Oct 14, 2014. There are no files selected for viewing
#!/usr/bin/env python3
# vim: set ts=4 sw=4 sts=4 et :
"""Timing harness for tallying and stripping ``*``-marked tokens.

A marked token is an asterisk followed by a run of one or more characters
other than ``.``, ``,``, ``;``, ``:``, tab, ``*`` and space.  Marked tokens
found in a text can be counted per ``pair`` in an SQLite table, and the
asterisks can be removed from the text.  Every stage prints timestamps
before and after it runs so the regex and SQL costs can be compared.
"""

import re
import sqlite3
from datetime import datetime

# Connection shared by every noteUnknownToken() call; opened on first use.
missingFreqsDBConn = None

# Group 1 captures the token text without its leading asterisk.
unknownMarkRE = re.compile(r'\*([^.,;:\t\* ]+)')


def stripUnknownMarks(text):
    """Return *text* with the leading ``*`` removed from every marked token.

    Timestamps are printed before and after the substitution.
    """
    print("[ str: ", datetime.now())
    stripped = unknownMarkRE.sub(r'\1', text)
    print("] str: ", datetime.now())
    return stripped


def noteUnknownTokens(text, pair, dbPath='this.db'):
    """Record every marked token found in *text* under *pair*.

    Args:
        text: Text to scan for ``*``-marked tokens.
        pair: Key stored alongside each token (presumably a language-pair
            identifier such as ``'spa-cat'`` -- confirm against callers).
        dbPath: SQLite database path handed to noteUnknownToken().  Defaults
            to ``'this.db'``, the path that used to be hard-coded here.

    Each occurrence is recorded separately, so a token that appears three
    times in *text* has its frequency raised by three.
    """
    print("[ re: ", datetime.now())
    print(pair, text)
    for token in unknownMarkRE.findall(text):
        print("->re: ", datetime.now())
        noteUnknownToken(token, pair, dbPath)
    print("] re: ", datetime.now())


def noteUnknownToken(token, pair, dbPath):
    """Increment the stored frequency of (*pair*, *token*) by one.

    The connection is opened on the first call and cached in the module-level
    ``missingFreqsDBConn``; *dbPath* is therefore only consulted while no
    connection is cached.  The table is created if it does not exist yet.

    Raises:
        sqlite3.Error: If the database cannot be opened or written.  The
            pending write is rolled back before the error propagates.
    """
    global missingFreqsDBConn
    print("[ sql: ", datetime.now())
    if missingFreqsDBConn is None:
        missingFreqsDBConn = sqlite3.connect(dbPath)
    # The connection context manager commits on success and rolls back if
    # either statement raises.
    with missingFreqsDBConn:
        missingFreqsDBConn.execute('CREATE TABLE IF NOT EXISTS missingFreqs (pair TEXT, token TEXT, frequency INTEGER, UNIQUE(pair, token))')
        # INSERT OR REPLACE plus COALESCE acts as an upsert: a new row starts
        # at 1, an existing row is rewritten with its old frequency + 1.
        missingFreqsDBConn.execute('INSERT OR REPLACE INTO missingFreqs VALUES (:pair, :token, COALESCE((SELECT frequency FROM missingFreqs WHERE pair=:pair AND token=:token), 0) + 1)', {'pair': pair, 'token': token})
    print("] sql: ", datetime.now())


longText = '*sampleWord *sampleWord *sampleWord *sampleWsord'

if __name__ == '__main__':
    noteUnknownTokens(longText, 'spa-cat')
    print(stripUnknownMarks(longText))