Created
November 1, 2025 10:48
-
-
Save xflr6/591a22e15b35fae418835f23584c35e4 to your computer and use it in GitHub Desktop.
Compare different Python Standard Library methods to read a UTF-8 encoded text file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Compare different stdlib methods to read an UTF-8 encoded text file.""" | |
| import codecs | |
| import io | |
| import os | |
| import pathlib | |
| import time | |
# Path of the generated benchmark file, relative to the current directory.
TEST_FILE = pathlib.Path('read_text.txt')
# Default text encoding for writing and reading the benchmark file.
ENCODING = 'utf-8'
# One line of sample text; the escapes are non-ASCII characters
# (typographic quotes, umlauts, sharp s) that are multi-byte in UTF-8.
LINE = '\u201eFix, Schwyz!\u201c, qu\xe4kt J\xfcrgen bl\xf6d vom Pa\xdf'
# Default number of copies of LINE written to the benchmark file.
N_LINES = 1_000_000
def setup_test_file(path: os.PathLike[str] | str = TEST_FILE, /, *,
                    n_lines: int = N_LINES,
                    encoding: str = ENCODING) -> pathlib.Path:
    """Return the benchmark file as a ``pathlib.Path``, creating it if absent.

    When *path* does not exist yet, it is written in text mode with
    *n_lines* copies of ``LINE``, each followed by a newline.  Whether the
    file was just created or already present, its size on disk is then
    checked against the size expected for *n_lines* lines terminated by
    ``os.linesep`` in *encoding*.

    Args:
        path: Location of the test file (positional-only).
        n_lines: Number of lines the file is expected to contain.
        encoding: Text encoding used for writing and for the size check.

    Returns:
        The test file location as a ``pathlib.Path``.

    Raises:
        AssertionError: If the file is missing after the creation step or
            its size differs from the expected size.
    """
    target = path if isinstance(path, pathlib.Path) else pathlib.Path(path)

    if not target.exists():
        with target.open('w', encoding=encoding) as f:
            f.writelines(f'{LINE}\n' for _ in range(n_lines))
    assert target.exists()

    # The size check counts os.linesep per line, matching what the
    # text-mode write above stores for each '\n'.
    bytes_per_line = len((LINE + os.linesep).encode(encoding))
    expected_size = bytes_per_line * n_lines
    size = target.stat().st_size
    assert size == expected_size, f'{size=} should be {expected_size}'
    return target
def read_text_open(path: os.PathLike[str] | str, /, *,
                   encoding: str = ENCODING) -> str:
    """Read the whole file using the built-in ``open()`` in text mode.

    Args:
        path: File to read (positional-only).
        encoding: Text encoding used to decode the file.

    Returns:
        The complete decoded file content.
    """
    with open(path, mode='r', encoding=encoding) as f:
        text = f.read()
    return text
def read_text_pathlib(path: os.PathLike[str] | str, /, *,
                      encoding: str = ENCODING) -> str:
    """Read the whole file using ``pathlib.Path.read_text()``.

    Args:
        path: File to read (positional-only).
        encoding: Text encoding used to decode the file.

    Returns:
        The complete decoded file content.
    """
    file_path = pathlib.Path(path)
    return file_path.read_text(encoding=encoding)
def read_text_io(path: os.PathLike[str] | str, /, *,
                 encoding: str = ENCODING) -> str:
    """Read the whole file via ``io.TextIOWrapper`` around a binary file.

    Args:
        path: File to read (positional-only).
        encoding: Text encoding used to decode the file.

    Returns:
        The complete decoded file content.

    Raises:
        LookupError: If *encoding* is not a known codec.
    """
    # The binary file gets its own context manager so it is closed even
    # when constructing the wrapper fails (e.g. unknown encoding); before,
    # the already-opened file was leaked in that case.
    with open(path, mode='rb') as raw, io.TextIOWrapper(raw, encoding) as f:
        result = f.read()
    # Closing the wrapper also closes the underlying binary file.
    assert f.closed
    return result
def read_text_codecs(path: os.PathLike[str] | str, /, *,
                     encoding: str = ENCODING) -> str:
    """Read the whole file via a ``codecs`` stream reader on a binary file.

    Args:
        path: File to read (positional-only).
        encoding: Name of the codec used to decode the file.

    Returns:
        The complete decoded file content.
    """
    # Look up the reader class first so an unknown encoding fails before
    # the file is opened.
    make_reader = codecs.getreader(encoding)
    with make_reader(open(path, mode='rb')) as reader:
        text = reader.read()
    assert reader.closed
    return text
if __name__ == '__main__':
    # Create (or reuse) the benchmark file, then time every reader on it.
    path = setup_test_file()

    readers = (
        read_text_open,
        read_text_pathlib,
        read_text_io,
        read_text_codecs,
    )
    for read_func in readers:
        print(read_func, end=' ')

        started = time.perf_counter_ns()
        result = read_func(path)
        elapsed_ns = time.perf_counter_ns() - started
        print(elapsed_ns / 1_000_000_000, 'seconds')

        assert isinstance(result, str)

        # stdlib codecs module lacks universal newline support (PEP 278)
        if read_func is read_text_codecs:
            linesep = os.linesep
        else:
            linesep = '\n'
        expected = len(LINE + linesep) * N_LINES
        assert len(result) == expected, f'{len(result)=} should be {expected}'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment