Skip to content

Instantly share code, notes, and snippets.

@gpfreitas
Last active March 21, 2024 22:22
Show Gist options
  • Save gpfreitas/e532fc3eaeebd524aa5e to your computer and use it in GitHub Desktop.

Revisions

  1. Guilherme Freitas revised this gist Mar 21, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -451,9 +451,9 @@ def show_imported_modules(files: list[str], include_builtin_and_stdlib: bool = F
    top_level_modules = {".".join(_.split(".")[: args.maxdepth]) for _ in used_modules}

    if include_builtin_and_stdlib:
    final_module_list = sorted(list(top_level_modules))
    final_module_list = sorted(top_level_modules)
    else:
    final_module_list = sorted(list(restrict_to_third_party(top_level_modules)))
    final_module_list = sorted(restrict_to_third_party(top_level_modules))

    return final_module_list

  2. Guilherme Freitas revised this gist Mar 21, 2024. 1 changed file with 40 additions and 16 deletions.
    56 changes: 40 additions & 16 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -427,6 +427,37 @@ def restrict_to_third_party(modules):
    return thirdparty


    def show_imported_modules(files: list[str], include_builtin_and_stdlib: bool = False, maxdepth=None):
        """Collect and return the modules imported by a set of Python source files.

        Args:
            files: paths (strings) of Python source files to scan.
            include_builtin_and_stdlib: if True, builtin and stdlib modules are
                kept; if False (default), only third-party modules are returned.
            maxdepth: how many dotted components of each module name to keep
                (1 -> top-level only). Defaults to the module-level CLI
                ``args.maxdepth`` for backward compatibility.

        Returns:
            A sorted list of imported module names.
        """
        if maxdepth is None:
            # NOTE(review): the original silently read the global ``args``
            # parsed in ``__main__``; keep that as a fallback so existing
            # callers keep working, but allow explicit override.
            maxdepth = args.maxdepth

        used_modules = set()
        for module_path in files:
            try:
                logger.debug(f"Trying to parse {module_path}")
                used_modules.update(module_dependencies(module_path))
            except ModuleSyntaxError:
                # Use the module logger consistently (original mixed
                # ``logger.debug`` with the root-logger ``logging.error``).
                logger.error("Problem processing module %s", module_path)

        # Truncate dotted names to the requested depth, e.g. depth 1 maps
        # "os.path" -> "os".
        top_level_modules = {".".join(m.split(".")[:maxdepth]) for m in used_modules}

        if include_builtin_and_stdlib:
            return sorted(top_level_modules)
        return sorted(restrict_to_third_party(top_level_modules))


    def cli():
    parser = argparse.ArgumentParser(
    prog="parse_imports",
    @@ -447,26 +478,19 @@ def cli():
    default=1,
    help="1 lists top-level modules, 2 lists modules and submodules, etc.",
    )
    parser.add_argument(
    "--loglevel",
    default="INFO",
    help="Select the log level: one of CRITICAL, ERROR, WARN, INFO, DEBUG",
    )

    return parser.parse_args()


    if __name__ == "__main__":
        # Script entry point (revision after moving scanning logic into
        # show_imported_modules). The diff residue mixing old and new lines
        # is collapsed to the post-revision state.
        args = cli()
        # Configure logging from the CLI-selected level before doing any work.
        loglevel = getattr(logging, args.loglevel)
        logging.basicConfig(level=loglevel)
        final_module_list = show_imported_modules(files=args.files, include_builtin_and_stdlib=args.all)
        print("\n".join(final_module_list))
        logger.debug("DONE")
  3. Guilherme Freitas revised this gist Mar 21, 2024. 1 changed file with 385 additions and 19 deletions.
    404 changes: 385 additions & 19 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,3 @@

    """
    This script takes as input a list of Python source files and outputs the
    top-level modules that are imported in those source files.
    @@ -8,39 +7,361 @@
    avoid any harmful side-effects of executing untrusted code.
    """

    import argparse
    import dis
    import sys
    import io
    import logging
    from tokenize import tokenize, untokenize, ENCODING, NAME, NEWLINE, NL
    from itertools import takewhile, dropwhile, chain

    logger = logging.getLogger(__name__)


    # Fallback inventory of builtin + stdlib module names captured from a
    # CPython 3.10 interpreter (set(sys.builtin_module_names) |
    # set(sys.stdlib_module_names)).  Used by ``restrict_to_third_party`` only
    # when running on Python < 3.10, where ``sys.stdlib_module_names`` does
    # not exist.
    # NOTE(review): order is arbitrary (it came from a set); only membership
    # matters, since the consumer converts this to a set.
    _PY310_STDLIB_AND_BUILTIN_MODULES = [
        "threading",
        "tracemalloc",
        "termios",
        "_posixshmem",
        "_decimal",
        "opcode",
        "ftplib",
        "pty",
        "socketserver",
        "fractions",
        "sys",
        "struct",
        "decimal",
        "_py_abc",
        "pwd",
        "getpass",
        "poplib",
        "_pyio",
        "__future__",
        "_frozen_importlib",
        "_queue",
        "glob",
        "json",
        "socket",
        "posix",
        "timeit",
        "functools",
        "tkinter",
        "sunau",
        "_opcode",
        "pipes",
        "cmd",
        "io",
        "_tracemalloc",
        "numbers",
        "lzma",
        "_locale",
        "csv",
        "grp",
        "cProfile",
        "gzip",
        "uu",
        "inspect",
        "_ast",
        "doctest",
        "reprlib",
        "_threading_local",
        "tabnanny",
        "sysconfig",
        "_codecs_cn",
        "_uuid",
        "binhex",
        "_multiprocessing",
        "shelve",
        "fileinput",
        "secrets",
        "pyclbr",
        "_overlapped",
        "_scproxy",
        "zlib",
        "runpy",
        "_symtable",
        "_elementtree",
        "nntplib",
        "traceback",
        "quopri",
        "turtledemo",
        "code",
        "turtle",
        "chunk",
        "readline",
        "multiprocessing",
        "winsound",
        "_csv",
        "audioop",
        "_dbm",
        "hashlib",
        "_posixsubprocess",
        "_sha3",
        "tokenize",
        "_thread",
        "select",
        "http",
        "tarfile",
        "plistlib",
        "pkgutil",
        "_blake2",
        "_codecs_tw",
        "argparse",
        "gc",
        "venv",
        "sched",
        "_lzma",
        "os",
        "_sha512",
        "types",
        "_tkinter",
        "crypt",
        "dis",
        "sre_constants",
        "queue",
        "imghdr",
        "curses",
        "linecache",
        "_crypt",
        "_pydecimal",
        "sndhdr",
        "shutil",
        "resource",
        "xmlrpc",
        "_curses",
        "_sitebuiltins",
        "_codecs_kr",
        "_weakrefset",
        "pickletools",
        "ntpath",
        "math",
        "trace",
        "ensurepip",
        "_socket",
        "unittest",
        "gettext",
        "builtins",
        "mimetypes",
        "_curses_panel",
        "sre_compile",
        "_markupbase",
        "contextlib",
        "smtplib",
        "subprocess",
        "nt",
        "_sre",
        "zoneinfo",
        "string",
        "errno",
        "array",
        "symtable",
        "_sqlite3",
        "telnetlib",
        "stat",
        "dataclasses",
        "_aix_support",
        "_functools",
        "_codecs_jp",
        "tempfile",
        "zipfile",
        "nturl2path",
        "site",
        "pydoc",
        "_compat_pickle",
        "ipaddress",
        "rlcompleter",
        "_msi",
        "_compression",
        "_gdbm",
        "spwd",
        "ssl",
        "cgi",
        "_strptime",
        "fnmatch",
        "compileall",
        "idlelib",
        "fcntl",
        "_random",
        "bdb",
        "_codecs_hk",
        "ctypes",
        "unicodedata",
        "asyncio",
        "collections",
        "_lsprof",
        "_bisect",
        "marshal",
        "nis",
        "typing",
        "cgitb",
        "logging",
        "_codecs_iso2022",
        "codeop",
        "imaplib",
        "base64",
        "re",
        "antigravity",
        "statistics",
        "mailbox",
        "itertools",
        "optparse",
        "warnings",
        "msvcrt",
        "_sha1",
        "pydoc_data",
        "bisect",
        "atexit",
        "abc",
        "calendar",
        "configparser",
        "_collections_abc",
        "_bootsubprocess",
        "asyncore",
        "copy",
        "_zoneinfo",
        "posixpath",
        "asynchat",
        "_asyncio",
        "_codecs",
        "pstats",
        "shlex",
        "this",
        "email",
        "_hashlib",
        "enum",
        "smtpd",
        "_ctypes",
        "_signal",
        "ast",
        "_operator",
        "xml",
        "platform",
        "xxsubtype",
        "binascii",
        "random",
        "stringprep",
        "heapq",
        "keyword",
        "mailcap",
        "ossaudiodev",
        "sre_parse",
        "contextvars",
        "imp",
        "textwrap",
        "mmap",
        "winreg",
        "_datetime",
        "_weakref",
        "_imp",
        "netrc",
        "zipimport",
        "_ssl",
        "hmac",
        "signal",
        "datetime",
        "distutils",
        "msilib",
        "uuid",
        "profile",
        "_sha256",
        "difflib",
        "tty",
        "aifc",
        "_pickle",
        "webbrowser",
        "operator",
        "encodings",
        "_bz2",
        "_winapi",
        "_json",
        "_statistics",
        "sqlite3",
        "copyreg",
        "_string",
        "filecmp",
        "getopt",
        "zipapp",
        "selectors",
        "wsgiref",
        "_osx_support",
        "importlib",
        "locale",
        "codecs",
        "lib2to3",
        "py_compile",
        "_io",
        "modulefinder",
        "pdb",
        "syslog",
        "_md5",
        "_abc",
        "pickle",
        "time",
        "graphlib",
        "_collections",
        "dbm",
        "genericpath",
        "xdrlib",
        "pathlib",
        "_stat",
        "_struct",
        "_frozen_importlib_external",
        "concurrent",
        "urllib",
        "faulthandler",
        "bz2",
        "_warnings",
        "colorsys",
        "token",
        "_heapq",
        "_multibytecodec",
        "html",
        "wave",
        "pprint",
        "pyexpat",
        "weakref",
        "_contextvars",
        "cmath",
    ]


    class BaseParseImportsError(Exception):
    pass


    class ModuleSyntaxError(BaseParseImportsError):
    pass


    def tokenize_source(source):
        """Map a string of Python source into an iterable of tokens.

        The source may contain syntax errors: ``tokenize`` only needs the text
        to be lexically scannable, not parseable.

        Args:
            source: Python source code as a ``str``.

        Returns:
            An iterator of ``tokenize.TokenInfo`` tuples.
        """
        # The diff residue kept two duplicate assignments; a single return
        # suffices. ``tokenize`` wants a bytes readline callable, so we
        # round-trip through UTF-8.
        return tokenize(io.BytesIO(source.encode("utf-8")).readline)


    def is_not_physical_newline_token(token):
        # Fixed inverted docstring: the function returns True when the token
        # is NOT a physical newline.
        "tokenize.TokenInfo -> True iff NOT a physical newline (NL) token."
        return token.type != NL


    def is_not_logical_newline_token(token):
        # Fixed inverted docstring and removed the duplicated return left by
        # the diff residue.
        "tokenize.TokenInfo -> True iff NOT a logical newline (NEWLINE) token."
        return token.type != NEWLINE


    def is_not_import_token(token):
        "tokenize.TokenInfo -> True iff not the beginning of an import statement."
        # Collapsed the diff residue (old two-line and new four-line variants
        # were both present). An import statement starts with the NAME token
        # ``import`` or ``from``.
        starts_import = token.type == NAME and token.string in ("import", "from")
        return not starts_import


    def extract_import_logical_lines(source):
    "Filters out logical lines from source that are not import statements."
    tokens = tokenize_source(source)
    @@ -50,8 +371,8 @@ def extract_import_logical_lines(source):
    tokens = chain([tok], tokens)
    start_import = dropwhile(is_not_import_token, tokens)
    import_tokens = takewhile(is_not_logical_newline_token, start_import)
    import_statement = untokenize(import_tokens) # a single logical line

    import_statement = untokenize(import_tokens) # a single logical line
    # For some reason the output of untokenize above contains various lines
    # with "\", the forwardslash character, used in Python for explicit
    # linebreaks. I think we obtain one "\" per line of source that we
    @@ -66,19 +387,21 @@ def extract_import_logical_lines(source):
    continue
    yield import_statement


    def imported_modules(import_statements):
        """Map a sequence of import statements to the set of imported module names.

        Each statement is compiled to bytecode (never executed) and scanned for
        ``IMPORT_NAME`` instructions, so untrusted code has no side effects.

        Raises:
            ModuleSyntaxError: if a statement cannot be compiled.
        """
        imports = set()
        for imp_statement in import_statements:
            try:
                instructions = dis.get_instructions(imp_statement)
                imports.update(
                    instr.argval for instr in instructions if instr.opname == "IMPORT_NAME"
                )
            except SyntaxError as err:
                # Chain the original error (previously discarded) so the real
                # cause survives in the traceback.
                raise ModuleSyntaxError(imp_statement) from err
        return imports


    def module_dependencies(module_path):
    "Maps the path of a module into the set of modules it imports."
    with open(module_path) as f:
    @@ -88,19 +411,62 @@ def module_dependencies(module_path):
    return imp_modules


    def restrict_to_third_party(modules):
    """Remove all modules that are stdlib or builtin from the sequence `modules`"""
    if sys.version_info >= (3, 10):
    builtin_and_stdlib = set(sys.builtin_module_names).union(set(sys.stdlib_module_names))
    else:
    logger.warning("Python version earlier than 3.10, assuming a list of builtin/stdlib modules from Python 3.10")
    builtin_and_stdlib = set(_PY310_STDLIB_AND_BUILTIN_MODULES)

    thirdparty = {m for m in modules if m.split(".")[0] not in builtin_and_stdlib}

    # if modules is just top-level modules, we are done. But modules may have module.submodule items. We also remove
    # those.

    return thirdparty


    def cli(argv=None):
        """Parse command-line arguments.

        Args:
            argv: argument list to parse; defaults to ``sys.argv[1:]`` when
                None (passing a list explicitly makes the parser testable).

        Returns:
            The parsed ``argparse.Namespace`` with ``files``, ``all`` and
            ``maxdepth`` attributes.
        """
        parser = argparse.ArgumentParser(
            prog="parse_imports",
            description="Shows python modules imported in a list of files",
            epilog="",
        )
        parser.add_argument("files", action="store", nargs="+", help="List of files to scan for modules")
        parser.add_argument(
            "--all",
            action="store_true",
            default=False,
            help="List builtin and stdlib modules also",
        )  # on/off flag
        parser.add_argument(
            "--maxdepth",
            action="store",
            type=int,
            default=1,
            help="1 lists top-level modules, 2 lists modules and submodules, etc.",
        )

        return parser.parse_args(argv)


    if __name__ == "__main__":
        # Script entry point: the diff residue interleaved the removed
        # sys.argv parsing with the new argparse-based flow; this is the
        # post-revision state only.
        logging.basicConfig(level=logging.INFO)
        args = cli()

        used_modules = set()
        for module_path in args.files:
            try:
                used_modules.update(module_dependencies(module_path))
            except ModuleSyntaxError:
                logging.error("Problem processing module {}".format(module_path))

        # Truncate dotted module names to the requested depth.
        top_level_modules = {".".join(m.split(".")[: args.maxdepth]) for m in used_modules}

        if args.all:
            final_module_list = sorted(top_level_modules)
        else:
            final_module_list = sorted(restrict_to_third_party(top_level_modules))

        print("\n".join(final_module_list))
  4. gpfreitas revised this gist Apr 26, 2016. 1 changed file with 9 additions and 3 deletions.
    12 changes: 9 additions & 3 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -90,11 +90,17 @@ def module_dependencies(module_path):

    if __name__ == "__main__":
        # Post-revision state of this diff hunk (residue of old/new lines
        # collapsed). An optional leading integer argument selects the depth.
        used_modules = set()
        if sys.argv[1].isdigit():
            maxdepth = int(sys.argv[1])
            first_module_pos = 2
        else:
            maxdepth = 2
            first_module_pos = 1
        for module_path in sys.argv[first_module_pos:]:
            try:
                this_module_deps = module_dependencies(module_path)
                used_modules.update(this_module_deps)
            except ModuleSyntaxError:
                # Dropped the unused ``as e`` binding; log and continue.
                logging.error("Problem processing module {}".format(module_path))
        top_level_modules = {'.'.join(_.split('.')[:maxdepth]) for _ in used_modules}
        print('\n'.join(sorted(list(top_level_modules))))
  5. gpfreitas created this gist Feb 21, 2016.
    100 changes: 100 additions & 0 deletions parse_import_dis.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,100 @@

    """
    This script takes as input a list of Python source files and outputs the
    top-level modules that are imported in those source files.
    The script does this without executing any code. This is useful when you have
    exercise code (that often has syntax errors / missing code) or if you want to
    avoid any harmful side-effects of executing untrusted code.
    """

    import dis
    import sys
    import io
    import logging
    from tokenize import tokenize, untokenize, ENCODING, NAME, NEWLINE, NL
    from itertools import takewhile, dropwhile, chain

    class BaseParseImportsError(Exception):
        """Common base for all errors raised by this script."""

    class ModuleSyntaxError(BaseParseImportsError):
        """Raised when a scanned module cannot be tokenized or compiled."""

    def tokenize_source(source):
        """Turn a string of Python source text into a token stream.

        Tokenization succeeds even when `source` contains syntax errors,
        since it does not require a parseable program.
        """
        readline = io.BytesIO(source.encode('utf-8')).readline
        return tokenize(readline)

    def is_not_physical_newline_token(token):
        # Fixed inverted docstring: returns True when the token is NOT NL.
        "tokenize.TokenInfo -> True iff NOT a physical newline (NL) token."
        return token.type != NL
    def is_not_logical_newline_token(token):
        # Fixed inverted docstring: returns True when the token is NOT NEWLINE.
        "tokenize.TokenInfo -> True iff NOT a logical newline (NEWLINE) token."
        return token.type != NEWLINE
    def is_not_import_token(token):
        "tokenize.TokenInfo -> True iff not the beginning of an import statement."
        # Import statements open with the NAME token 'import' or 'from'.
        starts_import = token.type == NAME and token.string in ('import', 'from')
        return not starts_import

    def extract_import_logical_lines(source):
        """Yield the logical lines of `source` that are import statements.

        Generator: each yielded value is the untokenized text of one
        ``import ...`` / ``from ... import ...`` logical line.
        """
        tokens = tokenize_source(source)
        # The first token of a tokenize stream is always the encoding marker;
        # consume and sanity-check it before filtering.
        encoding_token = next(tokens)
        assert encoding_token.type == ENCODING
        for tok in tokens:
            # Push the current token back onto the stream so the filters below
            # see it too.  Note `tokens` is rebound each iteration: dropwhile/
            # takewhile consume from the SAME underlying iterator, which is what
            # advances the outer loop past each handled statement.
            tokens = chain([tok], tokens)
            start_import = dropwhile(is_not_import_token, tokens)
            import_tokens = takewhile(is_not_logical_newline_token, start_import)

            import_statement = untokenize(import_tokens) # a single logical line
            # For some reason the output of untokenize above contains various lines
            # with "\", the forwardslash character, used in Python for explicit
            # linebreaks. I think we obtain one "\" per line of source that we
            # ignored. I would prefer for those lines to be filtered out, and
            # I tried accomplishing that by filtering out the tokenize.NL tokens.
            # That did not work. I could of course eliminate these linebreaks in
            # the string ``import_statement``, but those line breaks are not a
            # problem that I can see by running the program, or by reading the
            # source, so I decided to not eliminate the linebreaks from
            # ``import_statement``.
            if not import_statement:
                # No import found in the remaining stream; keep scanning.
                continue
            yield import_statement

    def imported_modules(import_statements):
        """Map a sequence of import statements to the set of module names they import.

        Statements are compiled to bytecode (never executed) and scanned for
        ``IMPORT_NAME`` instructions.

        Raises:
            ModuleSyntaxError: if a statement fails to compile.
        """
        imports = set()
        for imp_statement in import_statements:
            try:
                instructions = dis.get_instructions(imp_statement)
                new_imports = {ins.argval for ins in instructions if ins.opname == 'IMPORT_NAME'}
                imports.update(new_imports)
            except SyntaxError as e:
                # `e` was previously bound but unused; chain it so the real
                # cause survives in the traceback.
                raise ModuleSyntaxError(imp_statement) from e
        return imports

    def module_dependencies(module_path):
        """Map the path of a module to the set of modules it imports.

        Args:
            module_path: filesystem path of a ``.py`` file.

        Returns:
            Set of imported module names (dotted strings).
        """
        # Read as UTF-8 explicitly: the default locale encoding can mis-decode
        # UTF-8 sources on some platforms (e.g. Windows cp1252), and the
        # tokenizer downstream re-encodes as UTF-8 anyway.
        with open(module_path, encoding="utf-8") as f:
            source = f.read()
        import_statements = extract_import_logical_lines(source)
        imp_modules = imported_modules(import_statements)
        return imp_modules


    if __name__ == "__main__":
        used_modules = set()
        for module_path in sys.argv[1:]:
            try:
                this_module_deps = module_dependencies(module_path)
                used_modules.update(this_module_deps)
            except ModuleSyntaxError:
                # Dropped the unused ``as e`` binding; log and continue with
                # the remaining files.
                logging.error("Problem processing module {}".format(module_path))
        top_level_modules = {_.split('.')[0] for _ in used_modules}
        # Sort for deterministic output: a set iterates in arbitrary order
        # (a later revision of this gist made the same fix).
        print('\n'.join(sorted(top_level_modules)))