Skip to content

Instantly share code, notes, and snippets.

@ktaranov
Forked from peterhurford/num_rows_csv.R
Created February 25, 2021 13:17
Show Gist options
  • Select an option

  • Save ktaranov/0ecbd45076640e7df89acbc7ed2b246c to your computer and use it in GitHub Desktop.

Select an option

Save ktaranov/0ecbd45076640e7df89acbc7ed2b246c to your computer and use it in GitHub Desktop.

Revisions

  1. @peterhurford peterhurford revised this gist Jun 15, 2017. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -13,6 +13,10 @@ pryr::object_size(d)
    readr::write_csv(d, "tmp.csv")

    microbenchmark::microbenchmark(
    {lines <- 0; f <- file("tmp.csv", "r"); while (TRUE) {
    line <- readLines(f, n = 1)
    if (length(line) == 0) { break }; lines <- lines + 1}
    print(lines - 1) }, # 2784.9ms
    { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
    { length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
  2. @peterhurford peterhurford revised this gist Mar 4, 2017. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -24,5 +24,4 @@ microbenchmark::microbenchmark(
    { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
    { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 }, # 24.9ms
    { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 }, # 17.9ms

    times = 4)
  3. @peterhurford peterhurford revised this gist Mar 4, 2017. 1 changed file with 8 additions and 7 deletions.
    15 changes: 8 additions & 7 deletions num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -13,15 +13,16 @@ pryr::object_size(d)
    readr::write_csv(d, "tmp.csv")

    microbenchmark::microbenchmark(
    { dim(readr::read_csv("tmp.csv"))[[1]] }, # 414.4ms
    { NROW(readr::read_csv("tmp.csv")) }, # 391.7ms
    { dim(data.table::fread("tmp.csv"))[[1]] }, # 493.4ms
    { NROW(data.table::fread("tmp.csv")) }, # 472.7ms
    { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
    { length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
    { R.utils::countLines("tmp.csv")[[1]] - 1 }, # 1071.3ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
    { dim(data.table::fread("tmp.csv"))[[1]] }, # 493.4ms
    { NROW(data.table::fread("tmp.csv")) }, # 472.7ms
    { dim(readr::read_csv("tmp.csv"))[[1]] }, # 414.4ms
    { NROW(readr::read_csv("tmp.csv")) }, # 391.7ms
    { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
    { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 }, # 24.9ms
    { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 }, # 17.9ms
    { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
    { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms

    times = 4)
  4. @peterhurford peterhurford revised this gist May 20, 2016. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -13,7 +13,10 @@ pryr::object_size(d)
    readr::write_csv(d, "tmp.csv")

    microbenchmark::microbenchmark(
    { dim(readr::read_csv("tmp.csv")) }, # 394.3ms
    { dim(readr::read_csv("tmp.csv"))[[1]] }, # 414.4ms
    { NROW(readr::read_csv("tmp.csv")) }, # 391.7ms
    { dim(data.table::fread("tmp.csv"))[[1]] }, # 493.4ms
    { NROW(data.table::fread("tmp.csv")) }, # 472.7ms
    { length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
    { R.utils::countLines("tmp.csv")[[1]] - 1 }, # 1071.3ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
  5. @peterhurford peterhurford revised this gist May 19, 2016. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,6 @@
    # What's the fastest way to determine the number of rows of a CSV in R?
    # ...Reading the entire CSV to only get the dimensions is likely too slow. Is there a faster way?
    # Benchmarks done on an EC2 r3.8xlarge
    # Cowritten with Abel Castillo <github.com/abelcastilloavant>

    m <- 1000000
  6. @peterhurford peterhurford revised this gist May 19, 2016. 1 changed file with 7 additions and 7 deletions.
    14 changes: 7 additions & 7 deletions num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -12,12 +12,12 @@ pryr::object_size(d)
    readr::write_csv(d, "tmp.csv")

    microbenchmark::microbenchmark(
    { dim(readr::read_csv("tmp.csv")) }, # 394.3ms
    { length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
    { R.utils::countLines("tmp.csv")[[1]] - 1 }, # 1071.3ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
    { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 }, # 24.9ms
    { dim(readr::read_csv("tmp.csv")) }, # 394.3ms
    { length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
    { R.utils::countLines("tmp.csv")[[1]] - 1 }, # 1071.3ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
    { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 }, # 24.9ms
    { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 }, # 17.9ms
    { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
    { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms
    { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
    { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms
    times = 4)
  7. @peterhurford peterhurford revised this gist May 19, 2016. 1 changed file with 8 additions and 7 deletions.
    15 changes: 8 additions & 7 deletions num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -12,11 +12,12 @@ pryr::object_size(d)
    readr::write_csv(d, "tmp.csv")

    microbenchmark::microbenchmark(
    { dim(readr::read_csv("tmp.csv")) }, # 394.3ms
    { length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
    { R.utils::countLines("tmp.csv")[[1]] - 1 }, # 1071.3ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
    { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 }, # 24.9ms
    { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
    { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms
    { dim(readr::read_csv("tmp.csv")) }, # 394.3ms
    { length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
    { R.utils::countLines("tmp.csv")[[1]] - 1 }, # 1071.3ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
    { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 }, # 24.9ms
    { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 }, # 17.9ms
    { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
    { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms
    times = 4)
  8. @peterhurford peterhurford revised this gist May 19, 2016. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,7 @@
    # What's the fastest way to determine the number of rows of a CSV in R?
    # ...Reading the entire CSV to only get the dimensions is likely too slow. Is there a faster way?
    # Cowritten with Abel Castillo <github.com/abelcastilloavant>

    m <- 1000000
    d <- data.frame(id = seq(m), a = rnorm(m), b = runif(m))
    dim(d)
  9. @peterhurford peterhurford created this gist May 19, 2016.
    18 changes: 18 additions & 0 deletions num_rows_csv.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,18 @@
    # Benchmark several ways of obtaining the number of data rows of a CSV file.
    # Every candidate expression is written to evaluate to a single row count
    # (the "- 1" variants discount the header line written by write_csv), so the
    # alternatives are directly comparable.
    # The trailing "# ...ms" notes are the timings that were recorded next to the
    # original expressions.
    m <- 1000000
    d <- data.frame(id = seq_len(m), a = rnorm(m), b = runif(m))
    dim(d)
    # [1] 1000000 3
    pryr::object_size(d)
    # 20 MB

    # Side effect: creates tmp.csv in the current working directory.
    readr::write_csv(d, "tmp.csv")

    microbenchmark::microbenchmark(
    # Keep only the first dimension; bare dim() yields c(rows, cols), not a count.
    { dim(readr::read_csv("tmp.csv"))[[1]] }, # 394.3ms
    { length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
    { R.utils::countLines("tmp.csv")[[1]] - 1 }, # 1071.3ms
    { length(readLines("tmp.csv")) - 1 }, # 1750.3ms
    # trimws() guards against wc implementations that left-pad the count
    # (e.g. BSD/macOS); without it the first split field is "" and the result NA.
    { as.integer(strsplit(trimws(system("wc -l tmp.csv", intern = TRUE)), " ")[[1]][[1]]) - 1 }, # 24.9ms
    { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
    { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms
    times = 4)