#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Delete all comments from Stata file

WARNINGS
--------

Does not parse `#delimit ;`

`/*/*` and similar constructs are not parsed correctly. Note that as of
Stata 14, the parsing behavior of /*/ changed. `*/*` still ends a block
and starts a new one, however.

Usage
-----

From CLI:

$ python deleteStataComments.py /path/to/file.do

From Python:

>>> from deleteStataComments import deleteComments
>>> doCode = deleteComments(open('/path/to/file.do', 'r').read())
"""

from os import linesep
import sys

import regex

# Flags shared by every comment pattern below.
_COMMENT_FLAGS = regex.VERBOSE | regex.MULTILINE

# One alternation with two named groups, consumed by deleteCStyle():
#   ignore -- `//` text and double-quoted strings; matching them here keeps a
#             `/*` that appears inside them from being read as a block opener.
#   delete -- a `/* ... */` block; `(?R)` recurses into the whole pattern so
#             nested blocks are consumed, and `\Z` accepts an unterminated one.
_C_STYLE_BLOCK = regex.compile(
    r'(?P<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
    r'|'
    r'(?P<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))',
    flags=_COMMENT_FLAGS,
)

StataComment = {
    # Both keys resolve to the same compiled pattern so that a lookup by
    # either name succeeds.
    'multiNested': _C_STYLE_BLOCK,
    'multiNestedEscape': _C_STYLE_BLOCK,
    # Each inline pattern captures the whitespace in front of the comment as
    # `space`; deleteComments() substitutes the match with that capture.
    'inline': [
        # `//` comment that starts a line.
        regex.compile(
            r'^(?P<space>\s*)//[^/].*$',
            flags=_COMMENT_FLAGS,
        ),
        # `///` at the start of a line: removed through the start of the
        # following line.
        regex.compile(
            r'^(?P<space>\s*)///[\s\S]*?^',
            flags=_COMMENT_FLAGS,
        ),
        # `//` comment preceded by whitespace after other text on the line.
        regex.compile(
            r'(?!\B"[^"]*)(?P<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
            flags=_COMMENT_FLAGS,
        ),
        # `///` preceded by whitespace after other text on the line.
        regex.compile(
            r'(?!\B"[^"]*)(?P<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?^'.format(linesep),
            flags=_COMMENT_FLAGS,
        ),
    ],
    # A line whose first non-blank character is `*`, removed through the
    # start of the next line (or the end of the input).
    'linestar': regex.compile(
        r'^\s*\*[\s\S]*?(^|\Z)',
        flags=_COMMENT_FLAGS,
    ),
}

# Splits the input into alternating segments:
#   stata -- text in front of a mata block (or in front of the end of input)
#   mata  -- an optional run of capture/noisily/quietly prefixes, a `mata`
#            line, and everything through the `end` line (or end of input)
StataMata = regex.compile(
    r"(?P<stata>.*?)"
    r"(?P<mata>"
    r"(^\s*"
    r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*"
    r"mata\s*:?\s*$"
    r")"
    r".*?(\s*end.*?$|\Z)|\Z"
    r")",
    flags=regex.VERBOSE | regex.DOTALL | regex.MULTILINE,
)


def main():
    """Print the file named by the first CLI argument with comments removed."""
    if len(sys.argv) < 2:
        # Nothing to process; stdout stays empty, as it did before.
        sys.stderr.write(
            'usage: python deleteStataComments.py /path/to/file.do\n'
        )
        return

    doFile = sys.argv[1]
    with open(doFile, 'r') as doHandle:
        print(deleteComments(doHandle.read()))


def deleteComments(doStr):
    """Return ``doStr`` with the comments matched by ``StataComment`` removed.

    Three passes run in order:

    1. ``/* ... */`` blocks are deleted through ``deleteCStyle``, which puts
       back the ``//`` text and quoted strings the same pattern matches.
    2. Each ``inline`` pattern replaces its match with the whitespace it
       captured in front of the comment.
    3. ``StataMata`` splits the text into stata and mata segments, and
       ``deleteLineStar`` strips ``*`` comment lines from the stata ones.

    Parameters
    ----------
    doStr : str
        Contents of a Stata do-file.

    Returns
    -------
    str
        The same text with comments removed.
    """
    # Single pass through the callback: substituting '' for every match would
    # also erase the text captured by the `ignore` group.
    doStr = StataComment['multiNested'].sub(deleteCStyle, doStr)

    for inlineRegex in StataComment['inline']:
        # Raw string so that `\g` is not treated as a Python string escape.
        doStr = inlineRegex.sub(r'\g<space>', doStr)

    return StataMata.sub(deleteLineStar, doStr)


def deleteCStyle(match):
    """Substitution callback for the ``multiNested`` pattern.

    Returns the text captured by the ``ignore`` group unchanged, and an empty
    string otherwise, which deletes the block captured by ``delete``. A string
    is always returned.
    """
    ignored = match.group('ignore')
    return ignored if ignored else ''


def deleteLineStar(match):
    """Substitution callback for ``StataMata``.

    Removes ``*`` comment lines from the ``stata`` segment and appends the
    ``mata`` segment untouched.
    """
    # Read the groups by name rather than relying on groupdict() ordering.
    stata = match.group('stata')
    mata = match.group('mata')

    pieces = []
    if stata:
        pieces.append(StataComment['linestar'].sub('', stata))
    if mata:
        pieces.append(mata)
    return ''.join(pieces)


if __name__ == "__main__":
    main()