#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2018 seamus tuohy,
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.

# YOU WILL NEED THIS
# http://regexhero.net/reference/

"""Build a PowerShell (.NET) compatible homoglyph-detection regex.

This creates a powershell compatible regular expression that you can use to
check for homoglyphs of a specific string.
"""

import argparse
import logging
import re

logging.basicConfig(level=logging.ERROR)
log = logging.getLogger(__name__)

# \p{name} :: Matches any single character in the Unicode general category or
#             named block specified by name.
#             https://www.regular-expressions.info/unicode.html
# (?<name>pattern) :: Named capturing group.
# The group captures a (possibly empty) run of mark / separator / punctuation /
# "other" characters once, at the start of the generated expression.
_WHITESPACE_DEFINE = r'(?<whitespace>[\p{M}\p{Z}\p{P}\p{C}]*)'
# \k<name> :: Named backreference. Matches the value of a named expression.
_WHITESPACE_CHECK = r'\k<whitespace>'


def main():
    """Parse the command line and print the quoted homoglyph regex."""
    args = parse_arguments()
    set_logging(args.verbose, args.debug)
    if args.puny_only:
        homoglyphs = get_puny_homoglyphs()
    else:
        homoglyphs = get_homoglyphs()
    nm_homoglyphs = make_homoglyph_named_matches(homoglyphs)
    regex = create_nm_regex(args.string, nm_homoglyphs)
    print('"{0}"'.format(regex))


def make_homoglyph_named_matches(homoglyphs):
    """Return one named character-class group per lower-cased letter.

    Args:
        homoglyphs: Mapping of a letter to an iterable of look-alike glyphs
            (a list of characters or a plain string).

    Returns:
        A dict mapping each lower-cased key to ``(?<key>[glyphs])``. Entries
        whose keys differ only by case are merged. Glyphs are de-duplicated
        and sorted so the output is identical from run to run.
    """
    merged = {}
    for letter, glyphs in homoglyphs.items():
        # Upper- and lower-case tables share a single group.
        merged.setdefault(letter.lower(), set()).update(glyphs)

    nm_define = '(?<{0}>[{1}])'
    nm_regex = {}
    for letter, glyphs in merged.items():
        nm_regex[letter] = nm_define.format(letter, ''.join(sorted(glyphs)))
    return nm_regex


def get_regex_rejecting_name_match(string):
    """Return a negative lookahead that rejects the un-spoofed string.

    The string is split on whitespace; each word may start with either case
    of its first character, and the words are re-joined with single spaces.
    The text is inserted verbatim (regex metacharacters are not escaped).

    Args:
        string: The original string that must NOT be matched.

    Returns:
        A ``(?!...)`` lookahead fragment.
    """
    words = ["[{0}{1}]{2}".format(word[0].upper(), word[0].lower(), word[1:])
             for word in string.split()]
    return "(?!{0})".format(" ".join(words))


def create_nm_regex(string, nm_homoglyphs):
    """Create the homoglyph regex for ``string`` using named groups.

    The first occurrence of an ASCII letter is replaced by that letter's
    named group from ``nm_homoglyphs``; later occurrences of the same letter
    (case-insensitively) become a ``\\k<letter>`` backreference to it. Every
    other character is emitted as-is. Each element is preceded by a
    backreference to the ``whitespace`` group defined at the start of the
    expression, and the whole expression is prefixed with a lookahead that
    rejects the original string.

    Args:
        string: The string to build a homoglyph regex for.
        nm_homoglyphs: Mapping of lower-case letter to its named-group regex,
            as produced by ``make_homoglyph_named_matches``.

    Returns:
        The regular expression as a string.
    """
    nm_hg_checks = r"\k<{0}>"
    defined_letters = set()
    pieces = []
    for char in string:
        lowered = char.lower()
        if not re.match("[a-zA-Z]", char):
            # Don't replace non-ascii
            match_set = char
        elif lowered in defined_letters:
            match_set = nm_hg_checks.format(lowered)
        else:
            # Define the character group upon first use
            match_set = nm_homoglyphs.get(lowered)
            if match_set is None:
                # No group exists for this letter: emit it literally and do
                # not register it, otherwise a later occurrence would
                # backreference a group that was never defined.
                match_set = char
            else:
                defined_letters.add(lowered)
        pieces.append(_WHITESPACE_CHECK + match_set)

    homo_regex = (get_regex_rejecting_name_match(string)
                  + _WHITESPACE_DEFINE
                  + "".join(pieces))
    # Drop the whitespace check that directly follows a literal space.
    homo_regex = homo_regex.replace(_WHITESPACE_CHECK + ' ' + _WHITESPACE_CHECK,
                                    _WHITESPACE_CHECK + ' ')
    return homo_regex


def create_regex(string, homoglyphs):
    """Create a homoglyph regex for ``string`` without named letter groups.

    Every character is replaced by a character class holding the glyphs
    listed under both its lower- and upper-case key; a character with no
    listed glyphs is emitted literally.

    Args:
        string: The string to build a homoglyph regex for.
        homoglyphs: Mapping of a letter to an iterable of look-alike glyphs.

    Returns:
        The regular expression as a string.
    """
    pieces = []
    for char in string:
        variations = (set(homoglyphs.get(char.lower(), ()))
                      | set(homoglyphs.get(char.upper(), ())))
        if variations:
            match_set = "[{0}]".format(''.join(sorted(variations)))
        else:
            # An empty character class is not a valid pattern.
            match_set = char
        pieces.append(_WHITESPACE_CHECK + match_set)
    return _WHITESPACE_DEFINE + "".join(pieces)


def get_puny_homoglyphs():
    """Return the homoglyph table restricted to IDNA round-trippable text.

    Each glyph from ``get_homoglyphs`` is encoded and decoded with Python's
    ``idna`` codec and the result is kept. Glyphs the codec refuses are
    skipped instead of aborting the whole run.

    Returns:
        A dict mapping each key of ``get_homoglyphs`` to a string of the
        de-duplicated, sorted round-trip results.
    """
    puny_homoglyphs = {}
    for letter, glyphs in get_homoglyphs().items():
        compliant = set()
        for glyph in glyphs:
            try:
                compliant.add(glyph.encode('idna').decode('idna'))
            except UnicodeError:
                log.debug("Skipping %r: not IDNA encodable", glyph)
        puny_homoglyphs[letter] = ''.join(sorted(compliant))
    return puny_homoglyphs


def get_homoglyphs():
    """Return the built-in homoglyph table.

    Returns:
        A dict mapping each key of the table below to a sorted list of the
        unique characters in its string. There are no lower-case "l" or "o"
        keys.
    """
    homoglyph_strings = {
        " ": r"\s",
        "A": "𝗔𝖠𝙰𝘈A𝘼𝜜ꭺᗅ𝒜ꓮ𝔸Ꭺ𝓐𝚨ÅÁ𝔄𝝖𝐴À𝞐𐊠ᴀÂ𝐀ÃА𖽀𝑨𝕬𝛢AÄΑ",
        "B": "𝛣𝞑ᗷß𝗕ꞴB𝖡𝜝𐊡Β𝑩𝔹𝓑𝔅в𝘉ᛒ𝐵𝙱𝝗ꓐВ𝐁ᏼ𝚩ℬBβ𐌁𐊂𝘽ʙ𝕭Ᏼ",
        "C": "Ⲥ𝑪𑣩🝌Cℭ𝙲𝒞ꓚ𝓒Ꮯ𐊢ℂ𝐶C𐔜𝗖𝐂Ⅽ𐌂𑣲С𝘾𝘊𺀠𝖢𝕮Ϲ𐐕𐐠",
        "D": "𝓓𝗗ᗞ𝔻𝘿Đᗪ𝐷𝙳𝖣𝒟ĎꓓDⅅⅮ𝕯𝔇ᴅ𝐃𝑫𝘋DᎠꭰ",
        "E": "𝙀ÈĚ𝔈Éᴇ𝘌𝔼Е𑢦𝜠Ēℰ⋿𝝚ĔΕË𝛦𝑬𝚬𝗘𝞔ꭼĖE𝕰EĘ𑢮𝖤𝙴𐊆𝓔𝐸ꓰÊ𝐄Ꭼⴹ",
        "F": "𝙁𝔽𝑭F𝙵ꓝ𝐹ᖴ𝟊𐊇𝐅𝈓Ꞙ𝕱𝔉𝓕𝖥ℱ𝗙𑢢F𑣂𐔥𐊥Ϝ𝘍",
        "G": "𝘎Gԍ𝗚ɢ𝐺𝔾𝙂𝑮𝕲Ꮐնꮐ𝒢ᏻꓖԌ𝓖G𝔊𝐆𝙶𝖦Ᏻ",
        "H": "𝞖𝐇𝝜𝗛ℍ𝛨𝘏Ⲏ𝖧𝜢𝙷ꓧһнᎻℋꮋ𝚮Hᕼ𝓗𝐻𝑯ʜ𝙃Η𐋏H𝕳Нℌ",
        "I": "ι𝝸Ⅰiᛁꭵاӏ𝚤Ι𰰠𝐢𝑖І𝕚𝚒lᎥ˛𐐠⍳𝜄I𝗂ιіꙇⅰ𝛊ɪ𝖎ī𝙞iͺ𝒾𝓲ɩℹ𝔦𝗶𑣃𝜾𝞲ⅈı𝘪I𝒊",
        "J": "𝔍𝓙Ꭻ𝕁ᴊJ𝒥𝙹𝑱Ϳ𝙅յ𝐽Jꭻ𝗝𝕵Јᒍ𝐉ꓙ𝖩Ʝ𝘑",
        "K": "𝙆𝛫𝑲𝐾𝕂𝒦𝞙𝓚𝖪𝘒К𝝟ᛕꓗ𝙺𐔘KK𝚱𝔎𝐊𝗞Ⲕ𝕶ᏦΚ𝜥K",
        "L": "ι𐑃LⳐ𝕃𝙇𑢣L𝐋𝓛𝔏𝕷𝈪l𐐛𝙻𝐿ⳑʟⅬ𝑳𐔦ꓡ𝘓Ꮮᒪℒ𝖫ꮮ𖼖ⅼ𝗟𑢲",
        "M": "𝜧Ꮇℳ𝛭𝝡Μ𝞛𝑀𝗠𝙼𝔐Ϻ𝚳𐊰𝐌𝙈Ⅿ𝘔ᗰМ𝖬𐌑𝓜MꓟᛖⲘ𝑴𝕸M𝕄",
        "N": "𐔓𝑵𝝢𝙽N𝚴𝒩𝞜𝙉𝕹ℕ𝐍Ⲛ𝛮𝘕𝔑𝖭𝜨Nɴ𝗡ꓠ𝑁𝓝Ν",
        "O": "οΟoՕО0𱠠OoOо𐐠",
        "P": "𝙋𝑷𝜬Ꮲ𝞠ꮲ𝚸ℙ𝘗𝙿РᑭΡꓑ𝐏𝝦ᴩ𝓟𐊕𝖯𝛲ⲢᴘPP𝔓𝑃𝒫𝗣𝕻",
        "Q": "Qℚ𝖰𝙌𝚀𝗤𝘘𝐐ႳႭ𝑄ⵕ𝕼𝑸𝒬𝔔𝓠Q",
        "R": "ꭱ𐒴𝘙R𝑅ℝꮢᖇℛᚱℜ𝚁𝐑ƦR𝈖ꓣ𝕽𝙍𖼵Ꭱ𝖱𝗥𝑹Ꮢʀ𝓡",
        "S": "Ꮥ𵠠𝚂𝗦𝖲ႽЅ𝐒𝑺𝕊S𝕾𝑆𖼺𐊖S𝙎𝒮𝓢ꓢss𝘚ᏚՏѕ𝔖𐐠",
        "T": "⟙𝞣𑢼Ꭲ𐊗𝞽Τтᴛ𝐓𝒯𝙏𝖳𝜏Ⲧτ𝕋ꭲ𝑻𝗧𝑇T𐊱𐌕𖼊T𝛕𝚻𝝉⊤Т𝔗ꓔ𝜯𝝩𝘛𝛵🝨𝞃𝕿𝓣𝚃",
        "U": "𖽂𝔘𝓤𐓎𝚄ՍUU𝗨𝑼Ա𝙐⋃u𑢸𝑈μ𝖀υ𝐔ሀ∪𝕌𝖴ꓴᑌ𝒰𝘜",
        "V": "𝙑𝑽ꓦᏙ𝚅ѴⅤ𝐕𝔙𖼈V𝕍ꛟ𝖁𝗩𝘝𝈍V۷٧𑢠𐔝ⴸ𝖵𝑉𝓥𝒱ᐯ",
        "W": "𝘞𝗪𝖂𝐖𑣦Ԝ𝖶W𝓦𝕎wꓪW𝙒wᏔ𝚆𝑊𝔚𑣯𝒲𝑾Ꮃ",
        "X": "x𝕏𐌗𐊴Ꭓ𝒳𝛸X𝖃𝜲𝔛ꓫ𑣬𝘟𝓧Ⅹ𝐗𝞦Χ𝚇𐔧𝑋╳𐊐𝚾𝗫ᚷ𝙓XⲬⵝ𝝬𝑿χ𐌢Х𝖷᙭",
        "Y": "Ꭹ𝝪𝒀ʏy𝖄Ү𝐘ϒ𝒴γ𖽃𝙔𝚼𝔜𑢤Ꮍ𝚈𝞤ꓬy𝓨у𝘠𝛶YY𝑌УⲨ𝗬𐊲𝜰Υ𝖸𝕐",
        "Z": "𝚉𝐙𝒵𝙕ℨℤ𝘡𝞕𝒁𝖹Ꮓ𝛧𝓩𑢩Ζ𝜡𐋵𝖅ꓜZ𝝛𝗭𝑍𑣥𝚭Z",
        "a": "𝒶ã⍺α𝜶𝛼ǎɑâ𝖆𝖺𝑎а𝐚𝛂𝗮aáạä𝓪àăåȧa𝒂𝞪𝕒𝔞𝚊𝝰ą𝙖𝘢",
        "b": "𝗯𝖇ЬḇƅᏏᖯḅ𝓫𝕓d𝑏ḃlɓ𝘣𝙗Ƅ𝐛b𝒃𝒷𝖻𝔟b𝚋ʙ",
        "c": "𐐽ᴄⲥ𝖼𝘤𝒸𝙘𝒄𝓬ꮯᏟϲ𝐜с𝕔𝗰Ⅽ𝔠𺀠c𝚌𝑐𝖈ⅽc𐐠",
        "d": "𝗱ꓒ𝙙𝕕ԁᏧ𝒹ɗ𝖽𝘥ḏďd𝒅dɖl𝚍ᑯⅾ𝓭𝐝ḓ𝑑ժḑḋ𝔡đcḍb𝖉ⅆ",
        "e": "ꬲ𝖊𝕖𝚎℮êė𝔢ⅇȩҽ𝖾ē𝒆ḛĕ𝑒ɇ𝓮ẹℯ𝙚ę𝘦ée𝐞ëèеěce𝗲",
        "f": "𝔣𝙛ꞙ𝒻𝚏ƒf𝑓𝗳ẝf𝕗𝒇𝟋𝓯ք𝘧ꬵſ𝖿𝐟𝖋ϝḟ",
        "g": "ɡᶃ𝗴𝔤ɢǧ𝐠g𝘨qģ𝕘gնցġℊ𝗀ĝ𝒈ǥ𝚐ƍ𝓰𝙜𝑔𝖌ğǵ",
        "h": "ħȟհᏂⱨ𝚑ẖһ𝔥𝒽lḥḩ𝖍ℎ𝕙𝘩𝗁𝐡ɦ𝒉𝗵hhĥḧ𝓱𝙝ḣḫ",
        "i": "ι𝝸Ⅰiᛁɨꭵاӏ𝚤𰰠𝐢𝑖𝕚𝚒1lȋᎥ˛𐐠⍳𝜄𝗂ιіꙇⅰ𝛊ɪ𝖎ỉīĭ𝙞iͺ𝒾𝓲íɩℹ𝔦𝗶𑣃𝜾𝞲ịǐïⅈı𝘪Iì𝒊",
        "j": "𝔧𝚓jϳ𝗷𝐣𝙟𝒋𝗃յ𝒿𝑗𝖏ɉ𝘫ʝјⅉ𝕛j𝓳",
        "k": "𝐤𝑘𝗄𝗸𝚔ḳḵ𝓴𝓀kκⱪ𝕜k𝔨𝖐𝒌ķ𝙠𝘬",
        "m": "ᴍmmṁⅿḿṃɱrn",
        "n": "𝓃n𝚗ñr𝐧𝒏ռ𝙣mꞑ𝗇𝘯ṅńņ𝓷𝗻ǹɴnṇň𝑛ṉո𝕟𝖓𝔫",
        "p": "ƥ𝗉ṗᏢ𝝆ṕ𝒑𝛒𝕡𝔭p𝚙ρ𝝔⍴𝜌𝞀𝓹ƿϱⲣ𝑝P𝖕𝞺p𝓅𝐩р𝙥𝞎𝘱𝛠𝗽𝟈𝜚",
        "q": "𝐪𝖖g𝑞գqʠq𝘲Ⴍ𝚚ԛ𝕢𝓆𝔮𝗾𝒒𝗊Ⴓ𝙦զ𝓺",
        "r": "𝓇𝐫ṛrᴦꭈ𝑟ɼṙṟ𝘳ꭇ𝗿ȑ𝗋Իгɾŕɍȓ𝔯ⲅŗr𝒓ř𝙧ʀɽ𝚛ꮁ𝖗𝕣𝓻",
        "s": "𵠠ꜱ𑣁𝘀ႽЅṣƽ𝓼ŝṡ𐑈Sʂ𝑠𐐠ś𝙨𝓈S𝖘ss𝕤Ꮪ𝐬𝔰𝗌𝚜ѕ𝒔ꮪ𝘴șšՏ",
        "t": "𝐭𝒕𝑡ṫᎢ𝖙ț𝘁𝓽ƫτ𝔱ţ𝙩t𝓉𝗍𝘵𝚝ṭt𝕥ŧ",
        "u": "𝕦𑣘ůūǔùUꭎuՍUųűư𝗎ꞟʉսûԱ𝖚𝐮𝞄𝘂𝘶𝛖𐓶𝜐ú⋃uũȗụ𝒖𝓊𝔲üυμ𝝊ʋ𝑢ŭȕ𝞾𝓾𝙪𝚞ᴜꭒ",
        "v": "𝒗⋁𝚟𑣀𝗏ѵѴ𝜈𝞶𝑣𝓋𝐯𝔳𝘃v𝝼vⱱνטⱴ𝖛ᴠ𝘷∨ⅴ𝙫𝕧ṽꮩ𝓿ṿᶌ𑜆𝛎𝝂",
        "w": "𑜊ẅ𝑤𑜎𝘄ẘ𝖜ɯ𝒘𝔀W𝗐𝘸vw𝚠ẇẁ𝐰ẉWwẃ𝔴ԝꮃ𝕨աⱳ𑜏𝙬Ꮃŵᴡ𝓌ѡ",
        "x": "x𝐱⤬𝘅𝙭𝓍𝔁ᕽⅩᕁ𝗑𝖝𝑥᙮х𝚡𝔵×⤫ⅹχ𝒙𝘹x⨯𝕩",
        "y": "𝛄𝕪𝓎ʏɣ𝗒y𝒚Үŷγƴ𝚢ỿ𝛾𑣜ℽ𝞬ɏꭚẏ𝔂𝔶ყ𝝲ỵ𝘆ү𝖞ȳyýÿу𝘺𝙮𝑦YYᶌΥ𝐲𝜸",
        "z": "𝖟𝕫𝙯ꮓź𝘻zᏃ𝗓𝔃𝘇ʐƶż𝐳ⱬẕ𝓏𝒛ᴢẓ𝑧𝚣𑣄𝔷z",
    }
    # Sorting keeps the generated character classes stable across runs.
    return {name: sorted(set(glyphs))
            for name, glyphs in homoglyph_strings.items()}


# Command Line Functions below this point

def set_logging(verbose=False, debug=False):
    """Raise the module logger's verbosity; ``debug`` wins over ``verbose``.

    Args:
        verbose: When true (and ``debug`` is false) log at INFO level.
        debug: When true log at DEBUG level.
    """
    if debug:
        log.setLevel(logging.DEBUG)
    elif verbose:
        log.setLevel(logging.INFO)


def parse_arguments(argv=None):
    """Parse the command-line options.

    Args:
        argv: Optional list of argument strings; ``None`` makes argparse
            read ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with ``verbose``, ``debug``, ``string``
        and ``puny_only`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Create a PowerShell compatible regular expression "
                    "that checks for homoglyphs of a string")
    parser.add_argument("--verbose", "-v",
                        help="Turn verbosity on",
                        action='store_true')
    parser.add_argument("--debug", "-d",
                        help="Turn debugging on",
                        action='store_true')
    parser.add_argument("--string", "-s",
                        help="string to transform into regex",
                        required=True)
    parser.add_argument("--puny_only", "-p",
                        help="use only puny compliant chars",
                        action='store_true')
    return parser.parse_args(argv)


if __name__ == '__main__':
    main()