Skip to content

Instantly share code, notes, and snippets.

@xflr6
Created November 1, 2025 10:48
Show Gist options
  • Select an option

  • Save xflr6/591a22e15b35fae418835f23584c35e4 to your computer and use it in GitHub Desktop.

Select an option

Save xflr6/591a22e15b35fae418835f23584c35e4 to your computer and use it in GitHub Desktop.
Compare different Python Standard Library methods to read a UTF-8 encoded text file.
"""Compare different stdlib methods to read an UTF-8 encoded text file."""
import codecs
import io
import os
import pathlib
import time
TEST_FILE = pathlib.Path('read_text.txt')
ENCODING = 'utf-8'
LINE = '\u201eFix, Schwyz!\u201c, qu\xe4kt J\xfcrgen bl\xf6d vom Pa\xdf'
N_LINES = 1_000_000
def setup_test_file(path: os.PathLike[str] | str = TEST_FILE, /, *,
                    n_lines: int = N_LINES,
                    encoding: str = ENCODING) -> pathlib.Path:
    """Make sure the benchmark file exists, verify its size, return its path.

    If ``path`` does not exist yet, it is created in text mode and filled
    with ``n_lines`` copies of ``LINE``, one per line. A file that already
    exists is not rewritten; it is only checked.

    Args:
        path: Location of the test file (positional-only).
        n_lines: Number of lines to write and to expect in the file.
        encoding: Text encoding used for writing and for the size check.

    Returns:
        ``path`` as a ``pathlib.Path`` instance.

    Raises:
        AssertionError: If the file does not exist after setup, or if its
            size in bytes differs from the expected size.
    """
    target = path if isinstance(path, pathlib.Path) else pathlib.Path(path)

    if not target.exists():
        record = LINE + '\n'
        with target.open('w', encoding=encoding) as out:
            for _ in range(n_lines):
                out.write(record)
    assert target.exists()

    # The size check counts os.linesep per line because the file is written
    # in text mode with default newline handling.
    bytes_per_line = len((LINE + os.linesep).encode(encoding))
    expected_size = bytes_per_line * n_lines
    size = target.stat().st_size
    assert size == expected_size, f'{size=} should be {expected_size}'
    return target
def read_text_open(path: os.PathLike[str] | str, /, *,
                   encoding: str = ENCODING) -> str:
    """Return the decoded content of ``path`` using the ``open()`` builtin.

    Args:
        path: File to read (positional-only).
        encoding: Text encoding used to decode the file.

    Returns:
        The complete file content as a single string.
    """
    with open(path, encoding=encoding) as handle:
        content = handle.read()
    return content
def read_text_pathlib(path: os.PathLike[str] | str, /, *,
                      encoding: str = ENCODING) -> str:
    """Return the decoded content of ``path`` via ``pathlib.Path.read_text()``.

    Args:
        path: File to read (positional-only).
        encoding: Text encoding used to decode the file.

    Returns:
        The complete file content as a single string.
    """
    file_path = pathlib.Path(path)
    return file_path.read_text(encoding=encoding)
def read_text_io(path: os.PathLike[str] | str, /, *,
                 encoding: str = ENCODING) -> str:
    """Return the decoded content of ``path`` via ``io.TextIOWrapper``.

    The file is opened in binary mode and wrapped in a text layer that
    decodes it with ``encoding``.

    Args:
        path: File to read (positional-only).
        encoding: Text encoding used to decode the file.

    Returns:
        The complete file content as a single string.

    Raises:
        AssertionError: If the wrapper is not closed after leaving the
            ``with`` block.
    """
    # The binary file gets its own context manager: if constructing the
    # wrapper raises (e.g. LookupError for an unknown encoding), the
    # already opened file is still closed instead of being left dangling.
    with open(path, mode='rb') as raw, io.TextIOWrapper(raw, encoding) as f:
        result = f.read()
    assert f.closed
    return result
def read_text_codecs(path: os.PathLike[str] | str, /, *,
                     encoding: str = ENCODING) -> str:
    """Return the decoded content of ``path`` via a ``codecs`` stream reader.

    The file is opened in binary mode and wrapped in the stream reader
    that ``codecs.getreader()`` returns for ``encoding``.

    Args:
        path: File to read (positional-only).
        encoding: Name of the codec used to decode the file.

    Returns:
        The complete file content as a single string.

    Raises:
        AssertionError: If the reader does not report ``closed`` after
            leaving the ``with`` block.
    """
    # Resolve the codec first, so the file is only opened afterwards.
    make_reader = codecs.getreader(encoding)
    with make_reader(open(path, mode='rb')) as reader:
        text = reader.read()
    assert reader.closed
    return text
if __name__ == '__main__':
    path = setup_test_file()

    # Time one complete read of the test file per implementation and check
    # that each one returns the expected number of characters.
    for read_func in (read_text_open,
                      read_text_pathlib,
                      read_text_io,
                      read_text_codecs):
        print(read_func, end=' ')

        start = time.perf_counter_ns()
        result = read_func(path)
        stop = time.perf_counter_ns()

        # Convert the elapsed nanoseconds to seconds for display.
        duration = (stop - start) / 1_000_000_000
        print(duration, 'seconds')

        assert isinstance(result, str)

        # stdlib codecs module lacks universal newline support (PEP 278)
        if read_func is read_text_codecs:
            linesep = os.linesep
        else:
            linesep = '\n'
        expected = len(LINE + linesep) * N_LINES
        assert len(result) == expected, f'{len(result)=} should be {expected}'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment