ktaranov · February 25, 2021 13:17 · Jun 15, 2017 · Mar 4, 2017 · Mar 4, 2017 · May 20, 2016
diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -13,6 +13,10 @@ pryr::object_size(d)
 readr::write_csv(d, "tmp.csv")
 
 microbenchmark::microbenchmark(
+ {lines <- 0; f <- file("tmp.csv", "r"); while (TRUE) {
+    line <- readLines(f, n = 1)
+    if (length(line) == 0) { break }; lines <- lines + 1}
+    print(lines - 1) },                                                                # 2784.9ms
  { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },                 # 2103.0ms
  { length(readLines("tmp.csv")) - 1 },                                                 # 1750.3ms
  { length(count.fields("tmp.csv")) - 1 },                                              # 1519.1ms

diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -24,5 +24,4 @@ microbenchmark::microbenchmark(
  { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },   # 254.8ms
  { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 },  # 24.9ms
  { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 },                     # 17.9ms
-
  times = 4)
diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -13,15 +13,16 @@ pryr::object_size(d)
 readr::write_csv(d, "tmp.csv")
 
 microbenchmark::microbenchmark(
- { dim(readr::read_csv("tmp.csv"))[[1]] },                                             # 414.4ms
- { NROW(readr::read_csv("tmp.csv")) },                                                 # 391.7ms
- { dim(data.table::fread("tmp.csv"))[[1]] },                                           # 493.4ms
- { NROW(data.table::fread("tmp.csv")) },                                               # 472.7ms
+ { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },                 # 2103.0ms
+ { length(readLines("tmp.csv")) - 1 },                                                 # 1750.3ms
  { length(count.fields("tmp.csv")) - 1 },                                              # 1519.1ms
  { R.utils::countLines("tmp.csv")[[1]] - 1 },                                          # 1071.3ms
- { length(readLines("tmp.csv")) - 1 },                                                 # 1750.3ms
+ { dim(data.table::fread("tmp.csv"))[[1]] },                                           # 493.4ms
+ { NROW(data.table::fread("tmp.csv")) },                                               # 472.7ms
+ { dim(readr::read_csv("tmp.csv"))[[1]] },                                             # 414.4ms
+ { NROW(readr::read_csv("tmp.csv")) },                                                 # 391.7ms
+ { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },   # 254.8ms
  { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 },  # 24.9ms
  { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 },                     # 17.9ms
- { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },   # 254.8ms
- { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },                 # 2103.0ms
+
  times = 4)
diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -13,7 +13,10 @@ pryr::object_size(d)
 readr::write_csv(d, "tmp.csv")
 
 microbenchmark::microbenchmark(
- { dim(readr::read_csv("tmp.csv")) },                                                  # 394.3ms
+ { dim(readr::read_csv("tmp.csv"))[[1]] },                                             # 414.4ms
+ { NROW(readr::read_csv("tmp.csv")) },                                                 # 391.7ms
+ { dim(data.table::fread("tmp.csv"))[[1]] },                                           # 493.4ms
+ { NROW(data.table::fread("tmp.csv")) },                                               # 472.7ms
  { length(count.fields("tmp.csv")) - 1 },                                              # 1519.1ms
  { R.utils::countLines("tmp.csv")[[1]] - 1 },                                          # 1071.3ms
  { length(readLines("tmp.csv")) - 1 },                                                 # 1750.3ms

diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -1,5 +1,6 @@
 # What's the fastest way to determine the number of rows of a CSV in R?
 # ...Reading the entire CSV to only get the dimensions is likely too slow. Is there a faster way?
+# Benchmarks done on a EC2 r3.8xlarge
 # Cowritten with Abel Castillo <github.com/abelcastilloavant>
 
 m <- 1000000

diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -12,12 +12,12 @@ pryr::object_size(d)
 readr::write_csv(d, "tmp.csv")
 
 microbenchmark::microbenchmark(
- { dim(readr::read_csv("tmp.csv")) },						 	                                          # 394.3ms
- { length(count.fields("tmp.csv")) - 1 },						                                        # 1519.1ms
- { R.utils::countLines("tmp.csv")[[1]] - 1 },						                                    # 1071.3ms
- { length(readLines("tmp.csv")) - 1 },							                                          # 1750.3ms
- { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 },	 # 24.9ms
+ { dim(readr::read_csv("tmp.csv")) },                                                  # 394.3ms
+ { length(count.fields("tmp.csv")) - 1 },                                              # 1519.1ms
+ { R.utils::countLines("tmp.csv")[[1]] - 1 },                                          # 1071.3ms
+ { length(readLines("tmp.csv")) - 1 },                                                 # 1750.3ms
+ { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 },  # 24.9ms
  { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 },                     # 17.9ms
- { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },	  # 254.8ms
- { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },			              # 2103.0ms
+ { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },   # 254.8ms
+ { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },                 # 2103.0ms
  times = 4)
diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -12,11 +12,12 @@ pryr::object_size(d)
 readr::write_csv(d, "tmp.csv")
 
 microbenchmark::microbenchmark(
- { dim(readr::read_csv("tmp.csv")) },						 	# 394.3ms
- { length(count.fields("tmp.csv")) - 1 },						# 1519.1ms
- { R.utils::countLines("tmp.csv")[[1]] - 1 },						# 1071.3ms
- { length(readLines("tmp.csv")) - 1 },							# 1750.3ms
- { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 },	# 24.9ms
- { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },	# 254.8ms
- { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },			# 2103.0ms
+ { dim(readr::read_csv("tmp.csv")) },						 	                                          # 394.3ms
+ { length(count.fields("tmp.csv")) - 1 },						                                        # 1519.1ms
+ { R.utils::countLines("tmp.csv")[[1]] - 1 },						                                    # 1071.3ms
+ { length(readLines("tmp.csv")) - 1 },							                                          # 1750.3ms
+ { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 },	 # 24.9ms
+ { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 },                     # 17.9ms
+ { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },	  # 254.8ms
+ { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },			              # 2103.0ms
  times = 4)
diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -1,3 +1,7 @@
+# What's the fastest way to determine the number of rows of a CSV in R?
+# ...Reading the entire CSV to only get the dimensions is likely too slow. Is there a faster way?
+# Cowritten with Abel Castillo <github.com/abelcastilloavant>
+
 m <- 1000000
 d <- data.frame(id = seq(m), a = rnorm(m), b = runif(m))
 dim(d)

diff --git a/num_rows_csv.R b/num_rows_csv.R
@@ -0,0 +1,18 @@
+m <- 1000000
+d <- data.frame(id = seq(m), a = rnorm(m), b = runif(m))
+dim(d)
+# [1] 1000000       3
+pryr::object_size(d)
+# 20 MB
+
+readr::write_csv(d, "tmp.csv")
+
+microbenchmark::microbenchmark(
+ { dim(readr::read_csv("tmp.csv")) },						 	# 394.3ms
+ { length(count.fields("tmp.csv")) - 1 },						# 1519.1ms
+ { R.utils::countLines("tmp.csv")[[1]] - 1 },						# 1071.3ms
+ { length(readLines("tmp.csv")) - 1 },							# 1750.3ms
+ { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 },	# 24.9ms
+ { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },	# 254.8ms
+ { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },			# 2103.0ms
+ times = 4)
No results found