Skip to content

Instantly share code, notes, and snippets.

@gpfreitas
Last active March 21, 2024 22:22
Show Gist options
  • Save gpfreitas/e532fc3eaeebd524aa5e to your computer and use it in GitHub Desktop.

Revisions

  1. Guilherme Freitas revised this gist Mar 21, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -451,9 +451,9 @@ def show_imported_modules(files: list[str], include_builtin_and_stdlib: bool = F
    top_level_modules = {".".join(_.split(".")[: args.maxdepth]) for _ in used_modules}

    if include_builtin_and_stdlib:
    final_module_list = sorted(list(top_level_modules))
    final_module_list = sorted(top_level_modules)
    else:
    final_module_list = sorted(list(restrict_to_third_party(top_level_modules)))
    final_module_list = sorted(restrict_to_third_party(top_level_modules))

    return final_module_list

  2. Guilherme Freitas revised this gist Mar 21, 2024. 1 changed file with 40 additions and 16 deletions.
    56 changes: 40 additions & 16 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -427,6 +427,37 @@ def restrict_to_third_party(modules):
    return thirdparty


    def show_imported_modules(files: list[str], include_builtin_and_stdlib: bool = False, maxdepth=None):
        """Collect and return the modules imported by a set of Python source files.

        Args:
            files: paths (strings) of Python source files to scan.
            include_builtin_and_stdlib: if True, builtin and stdlib modules are
                kept; if False (default), only third-party modules are returned.
            maxdepth: how many dotted components of each module name to keep
                (1 -> top-level only). Defaults to the module-level CLI
                ``args.maxdepth`` for backward compatibility.

        Returns:
            A sorted list of imported module names.
        """
        if maxdepth is None:
            # NOTE(review): the original silently read the global ``args``
            # parsed in ``__main__``; keep that as a fallback so existing
            # callers keep working, but allow explicit override.
            maxdepth = args.maxdepth

        used_modules = set()
        for module_path in files:
            try:
                logger.debug(f"Trying to parse {module_path}")
                used_modules.update(module_dependencies(module_path))
            except ModuleSyntaxError:
                # Use the module logger consistently (original mixed
                # ``logger.debug`` with the root-logger ``logging.error``).
                logger.error("Problem processing module %s", module_path)

        # Truncate dotted names to the requested depth, e.g. depth 1 maps
        # "os.path" -> "os".
        top_level_modules = {".".join(m.split(".")[:maxdepth]) for m in used_modules}

        if include_builtin_and_stdlib:
            return sorted(top_level_modules)
        return sorted(restrict_to_third_party(top_level_modules))


    def cli():
    parser = argparse.ArgumentParser(
    prog="parse_imports",
    @@ -447,26 +478,19 @@ def cli():
    default=1,
    help="1 lists top-level modules, 2 lists modules and submodules, etc.",
    )
    parser.add_argument(
    "--loglevel",
    default="INFO",
    help="Select the log level: one of CRITICAL, ERROR, WARN, INFO, DEBUG",
    )

    return parser.parse_args()


    if __name__ == "__main__":
        # Script entry point (revision after moving scanning logic into
        # show_imported_modules). The diff residue mixing old and new lines
        # is collapsed to the post-revision state.
        args = cli()
        # Configure logging from the CLI-selected level before doing any work.
        loglevel = getattr(logging, args.loglevel)
        logging.basicConfig(level=loglevel)
        final_module_list = show_imported_modules(files=args.files, include_builtin_and_stdlib=args.all)
        print("\n".join(final_module_list))
        logger.debug("DONE")
  3. Guilherme Freitas revised this gist Mar 21, 2024. 1 changed file with 385 additions and 19 deletions.
    404 changes: 385 additions & 19 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,3 @@

    """
    This script takes as input a list of Python source files and outputs the
    top-level modules that are imported in those source files.
    @@ -8,39 +7,361 @@
    avoid any harmful side-effects of executing untrusted code.
    """

    import argparse
    import dis
    import sys
    import io
    import logging
    from tokenize import tokenize, untokenize, ENCODING, NAME, NEWLINE, NL
    from itertools import takewhile, dropwhile, chain

    logger = logging.getLogger(__name__)


    # Fallback inventory of builtin + stdlib module names captured from a
    # CPython 3.10 interpreter (set(sys.builtin_module_names) |
    # set(sys.stdlib_module_names)).  Used by ``restrict_to_third_party`` only
    # when running on Python < 3.10, where ``sys.stdlib_module_names`` does
    # not exist.
    # NOTE(review): order is arbitrary (it came from a set); only membership
    # matters, since the consumer converts this to a set.
    _PY310_STDLIB_AND_BUILTIN_MODULES = [
        "threading",
        "tracemalloc",
        "termios",
        "_posixshmem",
        "_decimal",
        "opcode",
        "ftplib",
        "pty",
        "socketserver",
        "fractions",
        "sys",
        "struct",
        "decimal",
        "_py_abc",
        "pwd",
        "getpass",
        "poplib",
        "_pyio",
        "__future__",
        "_frozen_importlib",
        "_queue",
        "glob",
        "json",
        "socket",
        "posix",
        "timeit",
        "functools",
        "tkinter",
        "sunau",
        "_opcode",
        "pipes",
        "cmd",
        "io",
        "_tracemalloc",
        "numbers",
        "lzma",
        "_locale",
        "csv",
        "grp",
        "cProfile",
        "gzip",
        "uu",
        "inspect",
        "_ast",
        "doctest",
        "reprlib",
        "_threading_local",
        "tabnanny",
        "sysconfig",
        "_codecs_cn",
        "_uuid",
        "binhex",
        "_multiprocessing",
        "shelve",
        "fileinput",
        "secrets",
        "pyclbr",
        "_overlapped",
        "_scproxy",
        "zlib",
        "runpy",
        "_symtable",
        "_elementtree",
        "nntplib",
        "traceback",
        "quopri",
        "turtledemo",
        "code",
        "turtle",
        "chunk",
        "readline",
        "multiprocessing",
        "winsound",
        "_csv",
        "audioop",
        "_dbm",
        "hashlib",
        "_posixsubprocess",
        "_sha3",
        "tokenize",
        "_thread",
        "select",
        "http",
        "tarfile",
        "plistlib",
        "pkgutil",
        "_blake2",
        "_codecs_tw",
        "argparse",
        "gc",
        "venv",
        "sched",
        "_lzma",
        "os",
        "_sha512",
        "types",
        "_tkinter",
        "crypt",
        "dis",
        "sre_constants",
        "queue",
        "imghdr",
        "curses",
        "linecache",
        "_crypt",
        "_pydecimal",
        "sndhdr",
        "shutil",
        "resource",
        "xmlrpc",
        "_curses",
        "_sitebuiltins",
        "_codecs_kr",
        "_weakrefset",
        "pickletools",
        "ntpath",
        "math",
        "trace",
        "ensurepip",
        "_socket",
        "unittest",
        "gettext",
        "builtins",
        "mimetypes",
        "_curses_panel",
        "sre_compile",
        "_markupbase",
        "contextlib",
        "smtplib",
        "subprocess",
        "nt",
        "_sre",
        "zoneinfo",
        "string",
        "errno",
        "array",
        "symtable",
        "_sqlite3",
        "telnetlib",
        "stat",
        "dataclasses",
        "_aix_support",
        "_functools",
        "_codecs_jp",
        "tempfile",
        "zipfile",
        "nturl2path",
        "site",
        "pydoc",
        "_compat_pickle",
        "ipaddress",
        "rlcompleter",
        "_msi",
        "_compression",
        "_gdbm",
        "spwd",
        "ssl",
        "cgi",
        "_strptime",
        "fnmatch",
        "compileall",
        "idlelib",
        "fcntl",
        "_random",
        "bdb",
        "_codecs_hk",
        "ctypes",
        "unicodedata",
        "asyncio",
        "collections",
        "_lsprof",
        "_bisect",
        "marshal",
        "nis",
        "typing",
        "cgitb",
        "logging",
        "_codecs_iso2022",
        "codeop",
        "imaplib",
        "base64",
        "re",
        "antigravity",
        "statistics",
        "mailbox",
        "itertools",
        "optparse",
        "warnings",
        "msvcrt",
        "_sha1",
        "pydoc_data",
        "bisect",
        "atexit",
        "abc",
        "calendar",
        "configparser",
        "_collections_abc",
        "_bootsubprocess",
        "asyncore",
        "copy",
        "_zoneinfo",
        "posixpath",
        "asynchat",
        "_asyncio",
        "_codecs",
        "pstats",
        "shlex",
        "this",
        "email",
        "_hashlib",
        "enum",
        "smtpd",
        "_ctypes",
        "_signal",
        "ast",
        "_operator",
        "xml",
        "platform",
        "xxsubtype",
        "binascii",
        "random",
        "stringprep",
        "heapq",
        "keyword",
        "mailcap",
        "ossaudiodev",
        "sre_parse",
        "contextvars",
        "imp",
        "textwrap",
        "mmap",
        "winreg",
        "_datetime",
        "_weakref",
        "_imp",
        "netrc",
        "zipimport",
        "_ssl",
        "hmac",
        "signal",
        "datetime",
        "distutils",
        "msilib",
        "uuid",
        "profile",
        "_sha256",
        "difflib",
        "tty",
        "aifc",
        "_pickle",
        "webbrowser",
        "operator",
        "encodings",
        "_bz2",
        "_winapi",
        "_json",
        "_statistics",
        "sqlite3",
        "copyreg",
        "_string",
        "filecmp",
        "getopt",
        "zipapp",
        "selectors",
        "wsgiref",
        "_osx_support",
        "importlib",
        "locale",
        "codecs",
        "lib2to3",
        "py_compile",
        "_io",
        "modulefinder",
        "pdb",
        "syslog",
        "_md5",
        "_abc",
        "pickle",
        "time",
        "graphlib",
        "_collections",
        "dbm",
        "genericpath",
        "xdrlib",
        "pathlib",
        "_stat",
        "_struct",
        "_frozen_importlib_external",
        "concurrent",
        "urllib",
        "faulthandler",
        "bz2",
        "_warnings",
        "colorsys",
        "token",
        "_heapq",
        "_multibytecodec",
        "html",
        "wave",
        "pprint",
        "pyexpat",
        "weakref",
        "_contextvars",
        "cmath",
    ]


    class BaseParseImportsError(Exception):
    pass


    class ModuleSyntaxError(BaseParseImportsError):
    pass


    def tokenize_source(source):
        """Map a string of Python source into an iterable of tokens.

        The source may contain syntax errors: ``tokenize`` only needs the text
        to be lexically scannable, not parseable.

        Args:
            source: Python source code as a ``str``.

        Returns:
            An iterator of ``tokenize.TokenInfo`` tuples.
        """
        # The diff residue kept two duplicate assignments; a single return
        # suffices. ``tokenize`` wants a bytes readline callable, so we
        # round-trip through UTF-8.
        return tokenize(io.BytesIO(source.encode("utf-8")).readline)


    def is_not_physical_newline_token(token):
        # Fixed inverted docstring: the function returns True when the token
        # is NOT a physical newline.
        "tokenize.TokenInfo -> True iff NOT a physical newline (NL) token."
        return token.type != NL


    def is_not_logical_newline_token(token):
        # Fixed inverted docstring and removed the duplicated return left by
        # the diff residue.
        "tokenize.TokenInfo -> True iff NOT a logical newline (NEWLINE) token."
        return token.type != NEWLINE


    def is_not_import_token(token):
        "tokenize.TokenInfo -> True iff not the beginning of an import statement."
        # Collapsed the diff residue (old two-line and new four-line variants
        # were both present). An import statement starts with the NAME token
        # ``import`` or ``from``.
        starts_import = token.type == NAME and token.string in ("import", "from")
        return not starts_import


    def extract_import_logical_lines(source):
    "Filters out logical lines from source that are not import statements."
    tokens = tokenize_source(source)
    @@ -50,8 +371,8 @@ def extract_import_logical_lines(source):
    tokens = chain([tok], tokens)
    start_import = dropwhile(is_not_import_token, tokens)
    import_tokens = takewhile(is_not_logical_newline_token, start_import)
    import_statement = untokenize(import_tokens) # a single logical line

    import_statement = untokenize(import_tokens) # a single logical line
    # For some reason the output of untokenize above contains various lines
    # with "\", the forwardslash character, used in Python for explicit
    # linebreaks. I think we obtain one "\" per line of source that we
    @@ -66,19 +387,21 @@ def extract_import_logical_lines(source):
    continue
    yield import_statement


    def imported_modules(import_statements):
        """Map a sequence of import statements to the set of imported module names.

        Each statement is compiled to bytecode (never executed) and scanned for
        ``IMPORT_NAME`` instructions, so untrusted code has no side effects.

        Raises:
            ModuleSyntaxError: if a statement cannot be compiled.
        """
        imports = set()
        for imp_statement in import_statements:
            try:
                instructions = dis.get_instructions(imp_statement)
                imports.update(
                    instr.argval for instr in instructions if instr.opname == "IMPORT_NAME"
                )
            except SyntaxError as err:
                # Chain the original error (previously discarded) so the real
                # cause survives in the traceback.
                raise ModuleSyntaxError(imp_statement) from err
        return imports


    def module_dependencies(module_path):
    "Maps the path of a module into the set of modules it imports."
    with open(module_path) as f:
    @@ -88,19 +411,62 @@ def module_dependencies(module_path):
    return imp_modules


    def restrict_to_third_party(modules):
    """Remove all modules that are stdlib or builtin from the sequence `modules`"""
    if sys.version_info >= (3, 10):
    builtin_and_stdlib = set(sys.builtin_module_names).union(set(sys.stdlib_module_names))
    else:
    logger.warning("Python version earlier than 3.10, assuming a list of builtin/stdlib modules from Python 3.10")
    builtin_and_stdlib = set(_PY310_STDLIB_AND_BUILTIN_MODULES)

    thirdparty = {m for m in modules if m.split(".")[0] not in builtin_and_stdlib}

    # if modules is just top-level modules, we are done. But modules may have module.submodule items. We also remove
    # those.

    return thirdparty


    def cli(argv=None):
        """Parse command-line arguments.

        Args:
            argv: argument list to parse; defaults to ``sys.argv[1:]`` when
                None (passing a list explicitly makes the parser testable).

        Returns:
            The parsed ``argparse.Namespace`` with ``files``, ``all`` and
            ``maxdepth`` attributes.
        """
        parser = argparse.ArgumentParser(
            prog="parse_imports",
            description="Shows python modules imported in a list of files",
            epilog="",
        )
        parser.add_argument("files", action="store", nargs="+", help="List of files to scan for modules")
        parser.add_argument(
            "--all",
            action="store_true",
            default=False,
            help="List builtin and stdlib modules also",
        )  # on/off flag
        parser.add_argument(
            "--maxdepth",
            action="store",
            type=int,
            default=1,
            help="1 lists top-level modules, 2 lists modules and submodules, etc.",
        )

        return parser.parse_args(argv)


    if __name__ == "__main__":
        # Script entry point: the diff residue interleaved the removed
        # sys.argv parsing with the new argparse-based flow; this is the
        # post-revision state only.
        logging.basicConfig(level=logging.INFO)
        args = cli()

        used_modules = set()
        for module_path in args.files:
            try:
                used_modules.update(module_dependencies(module_path))
            except ModuleSyntaxError:
                logging.error("Problem processing module {}".format(module_path))

        # Truncate dotted module names to the requested depth.
        top_level_modules = {".".join(m.split(".")[: args.maxdepth]) for m in used_modules}

        if args.all:
            final_module_list = sorted(top_level_modules)
        else:
            final_module_list = sorted(restrict_to_third_party(top_level_modules))

        print("\n".join(final_module_list))
  4. gpfreitas revised this gist Apr 26, 2016. 1 changed file with 9 additions and 3 deletions.
    12 changes: 9 additions & 3 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -90,11 +90,17 @@ def module_dependencies(module_path):

    if __name__ == "__main__":
        # Post-revision state of this diff hunk (residue of old/new lines
        # collapsed). An optional leading integer argument selects the depth.
        used_modules = set()
        if sys.argv[1].isdigit():
            maxdepth = int(sys.argv[1])
            first_module_pos = 2
        else:
            maxdepth = 2
            first_module_pos = 1
        for module_path in sys.argv[first_module_pos:]:
            try:
                this_module_deps = module_dependencies(module_path)
                used_modules.update(this_module_deps)
            except ModuleSyntaxError:
                # Dropped the unused ``as e`` binding; log and continue.
                logging.error("Problem processing module {}".format(module_path))
        top_level_modules = {'.'.join(_.split('.')[:maxdepth]) for _ in used_modules}
        print('\n'.join(sorted(list(top_level_modules))))
  5. gpfreitas created this gist Feb 21, 2016.
    100 changes: 100 additions & 0 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,100 @@

    """
    This script takes as input a list of Python source files and outputs the
    top-level modules that are imported in those source files.
    The script does this without executing any code. This is useful when you have
    exercise code (that often has syntax errors / missing code) or if you want to
    avoid any harmful side-effects of executing untrusted code.
    """

    import dis
    import sys
    import io
    import logging
    from tokenize import tokenize, untokenize, ENCODING, NAME, NEWLINE, NL
    from itertools import takewhile, dropwhile, chain

    class BaseParseImportsError(Exception):
        """Common base for all errors raised by this script."""

    class ModuleSyntaxError(BaseParseImportsError):
        """Raised when a scanned module cannot be tokenized or compiled."""

    def tokenize_source(source):
        """Turn a string of Python source text into a token stream.

        Tokenization succeeds even when `source` contains syntax errors,
        since it does not require a parseable program.
        """
        readline = io.BytesIO(source.encode('utf-8')).readline
        return tokenize(readline)

    def is_not_physical_newline_token(token):
        # Fixed inverted docstring: returns True when the token is NOT NL.
        "tokenize.TokenInfo -> True iff NOT a physical newline (NL) token."
        return token.type != NL
    def is_not_logical_newline_token(token):
        # Fixed inverted docstring: returns True when the token is NOT NEWLINE.
        "tokenize.TokenInfo -> True iff NOT a logical newline (NEWLINE) token."
        return token.type != NEWLINE
    def is_not_import_token(token):
        "tokenize.TokenInfo -> True iff not the beginning of an import statement."
        # Import statements open with the NAME token 'import' or 'from'.
        starts_import = token.type == NAME and token.string in ('import', 'from')
        return not starts_import

    def extract_import_logical_lines(source):
        """Yield the logical lines of `source` that are import statements.

        Generator: each yielded value is the untokenized text of one
        ``import ...`` / ``from ... import ...`` logical line.
        """
        tokens = tokenize_source(source)
        # The first token of a tokenize stream is always the encoding marker;
        # consume and sanity-check it before filtering.
        encoding_token = next(tokens)
        assert encoding_token.type == ENCODING
        for tok in tokens:
            # Push the current token back onto the stream so the filters below
            # see it too.  Note `tokens` is rebound each iteration: dropwhile/
            # takewhile consume from the SAME underlying iterator, which is what
            # advances the outer loop past each handled statement.
            tokens = chain([tok], tokens)
            start_import = dropwhile(is_not_import_token, tokens)
            import_tokens = takewhile(is_not_logical_newline_token, start_import)

            import_statement = untokenize(import_tokens) # a single logical line
            # For some reason the output of untokenize above contains various lines
            # with "\", the forwardslash character, used in Python for explicit
            # linebreaks. I think we obtain one "\" per line of source that we
            # ignored. I would prefer for those lines to be filtered out, and
            # I tried accomplishing that by filtering out the tokenize.NL tokens.
            # That did not work. I could of course eliminate these linebreaks in
            # the string ``import_statement``, but those line breaks are not a
            # problem that I can see by running the program, or by reading the
            # source, so I decided to not eliminate the linebreaks from
            # ``import_statement``.
            if not import_statement:
                # No import found in the remaining stream; keep scanning.
                continue
            yield import_statement

    def imported_modules(import_statements):
        """Map a sequence of import statements to the set of module names they import.

        Statements are compiled to bytecode (never executed) and scanned for
        ``IMPORT_NAME`` instructions.

        Raises:
            ModuleSyntaxError: if a statement fails to compile.
        """
        imports = set()
        for imp_statement in import_statements:
            try:
                instructions = dis.get_instructions(imp_statement)
                new_imports = {ins.argval for ins in instructions if ins.opname == 'IMPORT_NAME'}
                imports.update(new_imports)
            except SyntaxError as e:
                # `e` was previously bound but unused; chain it so the real
                # cause survives in the traceback.
                raise ModuleSyntaxError(imp_statement) from e
        return imports

    def module_dependencies(module_path):
        """Map the path of a module to the set of modules it imports.

        Args:
            module_path: filesystem path of a ``.py`` file.

        Returns:
            Set of imported module names (dotted strings).
        """
        # Read as UTF-8 explicitly: the default locale encoding can mis-decode
        # UTF-8 sources on some platforms (e.g. Windows cp1252), and the
        # tokenizer downstream re-encodes as UTF-8 anyway.
        with open(module_path, encoding="utf-8") as f:
            source = f.read()
        import_statements = extract_import_logical_lines(source)
        imp_modules = imported_modules(import_statements)
        return imp_modules


    if __name__ == "__main__":
        used_modules = set()
        for module_path in sys.argv[1:]:
            try:
                this_module_deps = module_dependencies(module_path)
                used_modules.update(this_module_deps)
            except ModuleSyntaxError:
                # Dropped the unused ``as e`` binding; log and continue with
                # the remaining files.
                logging.error("Problem processing module {}".format(module_path))
        top_level_modules = {_.split('.')[0] for _ in used_modules}
        # Sort for deterministic output: a set iterates in arbitrary order
        # (a later revision of this gist made the same fix).
        print('\n'.join(sorted(top_level_modules)))