#'
#' This script does some basic filtering of the ecoevojobs list to narrow
#' down jobs. Much of the filtering is based on two text files: one listing
#' locations to keep (`locations.txt`) and the other listing institutions to
#' keep (`institutions.txt`).
#' Each file has one item per line.
#' For example, the top of `locations.txt` might look like this:
#' ```
#' Alabama
#' Alaska
#' Arizona
#' Arkansas
#' California
#' Colorado
#' ```
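#'
#' Similarly, `institutions.txt` holds one institution name per line, matched
#' exactly against the sheet's `Institution` column (via `%in%` below).
#' A few illustrative entries (these names are hypothetical examples, not
#' from the original file):
#' ```
#' University of Wisconsin-Madison
#' Oregon State University
#' University of Georgia
#' ```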
#'
library(googlesheets4)
library(tidyverse)
library(lubridate)
#' ecoevojobs is public, so no need to authenticate
gs4_deauth()
#' Date-time when you last manually curated the institution list.
#' Because I build my institution list from what's currently on ecoevojobs,
#' any new jobs that pop up after that curation could be missed if I keep
#' using an outdated institutions list.
#'
#' ** DON'T JUST CHANGE THIS TO THE CURRENT DATE-TIME WITHOUT UPDATING **
#' ** THE `institutions.txt` FILE. **
#'
#' See `>>>>>>>>>>>>>>>>>>>>>>>>` below for how to update this.
#'
curation_dt <- as.POSIXct("2022-11-02 09:24:00", tz = "America/Los_Angeles")
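#' (A small optional guard, not part of the original workflow: warn yourself
#' when the curated list looks stale. The two-week threshold here is an
#' arbitrary assumption; adjust to taste.)
if (difftime(Sys.time(), curation_dt, units = "days") > 14) {
  warning("`institutions.txt` may be outdated; see the update steps below")
}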
# Curated list of institutions I'm interested in:
insts <- read_lines("institutions.txt")
# List of locations I'm interested in:
locs <- read_lines("locations.txt") |>
  # In mine, I have some comments that specify how I generated this list,
  # which I want to skip here:
  discard(~ str_starts(.x, "#"))
#' Fix weird dates in the `Review Date` column, which mixes real dates with
#' free text like "early Nov" or "mid-October":
date_fixer <- function(bad_dates) {
  better_dates <- map(bad_dates, function(d) {
    if (is.null(d)) {
      as.Date(NA)
    } else if (is.character(d)) {
      d <- tolower(d)
      # Regex that matches any lowercase month abbreviation:
      abbr_names <- paste(tolower(month.abb), collapse = "|")
      if (str_detect(d, abbr_names)) {
        m_ind <- map_lgl(tolower(month.abb), ~ str_detect(d, .x)) |>
          which()
        if (length(m_ind) > 1) stop("multiple months found")
        # "early <month>" -> the 1st; "mid <month>" or anything else -> the 15th:
        d_ind <- case_when(str_detect(d, "mid") ~ 15L,
                           str_detect(d, "early") ~ 1L,
                           TRUE ~ 15L)
        as.Date(sprintf("%s-%i-%i", year(today()), m_ind, d_ind))
      } else {
        as.Date(NA)
      }
    } else {
      as.Date(d)
    }
  })
  # `unlist()` strips the Date class, so restore it:
  good_dates <- better_dates |>
    unlist() |>
    as.Date(origin = "1970-01-01")
  return(good_dates)
}
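#' A quick sanity check on hypothetical inputs (the real argument is the
#' list column that `read_sheet()` returns for `Review Date`, which mixes
#' dates, free text, and NULLs); the output shown assumes a run in 2022:
date_fixer(list(as.Date("2022-10-15"), "early Nov", "mid-Dec", NULL))
#> [1] "2022-10-15" "2022-11-01" "2022-12-15" NA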
#' Table of jobs directly from ecoevojobs (this can take a few tries):
ecoevo_df <- paste0("https://docs.google.com/spreadsheets/d/",
                    "1cqTuSeLtH-Zw7X9ZtnhQxzw3r19Rya9nzdqRW9apTmY/edit#gid=865906911") |>
  read_sheet(sheet = "Faculty / Permanent Jobs", skip = 1) |>
  #' With fixed review dates:
  mutate(`Review Date` = date_fixer(`Review Date`))
#' >>>>>>>>>>>>>>>>>>>>>>>>
#' How to update the curated institutions list:
#' Run this to identify unique institutions from new job postings,
#' manually add any you're interested in to the `institutions.txt` file,
#' then re-run the command `insts <- read_lines("institutions.txt")`.
ecoevo_df |>
  filter(Timestamp > curation_dt) |>
  getElement("Institution") |>
  unique() |>
  sort()
#'
#' Filter for...
#' 1. Tenure-track positions that allow assistant professors (including open rank)
#' 2. Location is in your list of desired ones
#' 3. Institution is in your list of desired ones
#' 4. Review date is no more than a week past due
#'
job_df <- ecoevo_df |>
  filter(Appointment == "Tenure Track",
         Rank %in% c("Asst / Assoc Prof", "Asst or Assoc Prof", "Asst Prof",
                     "Open Rank", "Rank Open"),
         Location %in% locs,
         Institution %in% insts,
         `Review Date` > (Sys.Date() - 7))
#'
#' Now I write this to a CSV file to manually search by subject area.
#'
write_csv(job_df, sprintf("filtered_jobs_%s.csv", Sys.Date()))