@thomwolf
Last active January 12, 2025 13:34
Gist: thomwolf/13ca2b2b172b2d17ac66685aa2eeba62 (loading_wikipedia.py)
Revisions

  1. thomwolf revised this gist Oct 29, 2020. 1 changed file with 1 addition and 1 deletion.
    loading_wikipedia.py: 1 addition, 1 deletion

    @@ -1,5 +1,5 @@
     import os; import psutil; import timeit
    -from nlp import load_dataset
    +from datasets import load_dataset
     
     mem_before = psutil.Process(os.getpid()).memory_info().rss >> 20
     wiki = load_dataset("wikipedia", "20200501.en", split='train')
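
    Context for this one-line change: Hugging Face renamed the nlp library to datasets in 2020, and the loading API carried over unchanged, so only the import moves. A minimal sketch of the post-rename call, assuming a datasets version that still ships the "wikipedia" builder with the "20200501.en" config (later releases moved to newer dumps such as "20220301.en"):

    # Minimal sketch of the post-rename usage; assumes a datasets version
    # that still provides the "wikipedia" builder with the "20200501.en" config.
    from datasets import load_dataset

    wiki = load_dataset("wikipedia", "20200501.en", split="train")
    print(wiki.column_names)  # expected: ['title', 'text'] for this config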
  2. thomwolf revised this gist Jun 15, 2020. 1 changed file with 1 addition and 1 deletion.
    loading_wikipedia.py: 1 addition, 1 deletion

    @@ -12,4 +12,4 @@
     """
     time = timeit.timeit(stmt=s, number=1, globals=globals())
     size = wiki.dataset_size / 2**30
    -print(f"Iterated over the {size:.1f} GB dataset in {time:.1f} s, i.e. {size/time:.1f} Gbit/s")
    +print(f"Iterated over the {size:.1f} GB dataset in {time:.1f} s, i.e. {size * 8/time:.1f} Gbit/s")
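
    This revision fixes a units bug rather than a measurement: wiki.dataset_size is in bytes, so size is in gigabytes and size / time is GB/s, while the printed label claims Gbit/s. Multiplying by 8 (bits per byte) makes the number match the label. A tiny worked check with made-up figures:

    # Units check with illustrative numbers (not measured): 17.0 GB in 10.0 s.
    size, time = 17.0, 10.0
    print(f"{size / time:.1f} GB/s")        # 1.7  -> what the old line computed
    print(f"{size * 8 / time:.1f} Gbit/s")  # 13.6 -> 8 bits per byte, matches the label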
  3. thomwolf created this gist Jun 15, 2020.
    loading_wikipedia.py: 15 additions, 0 deletions

    @@ -0,0 +1,15 @@
    +import os; import psutil; import timeit
    +from nlp import load_dataset
    +
    +mem_before = psutil.Process(os.getpid()).memory_info().rss >> 20
    +wiki = load_dataset("wikipedia", "20200501.en", split='train')
    +mem_after = psutil.Process(os.getpid()).memory_info().rss >> 20
    +print(f"RAM memory used: {(mem_after - mem_before)} MB")
    +
    +s = """batch_size = 1000
    +for i in range(0, len(wiki), batch_size):
    +    batch = wiki[i:i + batch_size]
    +"""
    +time = timeit.timeit(stmt=s, number=1, globals=globals())
    +size = wiki.dataset_size / 2**30
    +print(f"Iterated over the {size:.1f} GB dataset in {time:.1f} s, i.e. {size/time:.1f} Gbit/s")
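
    The original script makes two points: the RAM delta it prints stays small even for the full English Wikipedia, because the Arrow-backed dataset is memory-mapped from disk rather than loaded into memory, and batched slicing still iterates over it quickly. Note that slicing a datasets table returns a dict of columns, not a list of rows. A self-contained toy sketch of that slicing pattern (the data is invented; only the return shape is the point):

    # Toy illustration of the wiki[i:i + batch_size] pattern; the data is
    # invented, only the dict-of-columns return shape is the point.
    from datasets import Dataset

    toy = Dataset.from_dict({"title": ["a", "b", "c", "d"],
                             "text": ["w", "x", "y", "z"]})
    batch_size = 2
    for i in range(0, len(toy), batch_size):
        batch = toy[i:i + batch_size]  # {'title': [...], 'text': [...]}
        print(i, batch["title"])       # 0 ['a', 'b'] then 2 ['c', 'd']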