Skip to content

Instantly share code, notes, and snippets.

@lucasnell
Last active November 2, 2022 17:17
Show Gist options
  • Select an option

  • Save lucasnell/f7d47ee906376a0ade59fc913abd2a29 to your computer and use it in GitHub Desktop.

Select an option

Save lucasnell/f7d47ee906376a0ade59fc913abd2a29 to your computer and use it in GitHub Desktop.

Revisions

  1. lucasnell revised this gist Nov 2, 2022. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions filter-ecoevojobs.R
    Original file line number Diff line number Diff line change
    @@ -106,8 +106,7 @@ job_df <- ecoevo_df |>
    "Open Rank", "Rank Open"),
    Location %in% locs,
    Institution %in% insts,
    `Review Date` > (Sys.Date() - 7)) |>
    select(Institution, `Subject Area`, `Review Date`, URL, Rank, Notes)
    `Review Date` > (Sys.Date() - 7))


    #'
  2. lucasnell created this gist Nov 2, 2022.
    117 changes: 117 additions & 0 deletions filter-ecoevojobs.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,117 @@

    #'
    #' This does some basic filtering of the ecoevojobs list to narrow down jobs.
    #' Much of the filtering is based on two text files, one containing locations
    #' to filter for (`locations.txt`), and the other for institutions to filter
    #' for (`institutions.txt`).
    #' Each of these files has one item per line.
    #' For example, the top of `locations.txt` might look like this:
    #' ```
    #' Alabama
    #' Alaska
    #' Arizona
    #' Arkansas
    #' California
    #' Colorado
    #' ```
    #'

    library(googlesheets4)
    library(tidyverse)
    library(lubridate)

    #' The ecoevojobs sheet is public, so skip Google authentication entirely.
    gs4_deauth()


    #' Date-time of the most recent manual curation of `institutions.txt`.
    #' The institution list is built from whatever is on ecoevojobs at curation
    #' time, so any posting added after this moment may belong to an institution
    #' that is not in the list yet and would otherwise be missed.
    #'
    #' ** DON'T JUST CHANGE THIS TO CURRENT DATE-TIME WITHOUT UPDATING **
    #' ** THE `institutions.txt` FILE. **
    #'
    #' See `>>>>>>>>>>>>>>>>>>>>>>>>` below for how to update this
    #'
    curation_dt <- as.POSIXct("2022-11-02 09:24:00", tz = "America/Los_Angeles")

    # Institutions I'm interested in, one per line:
    insts <- read_lines("institutions.txt")

    # Locations I'm interested in, one per line. My file also holds "#"-prefixed
    # comment lines describing how the list was generated; drop those.
    locs <- read_lines("locations.txt")
    locs <- locs[!str_starts(locs, "#")]

    #' Fix weird dates in `Review Date` column
    #'
    #' Converts every entry of a mixed-type column into a proper `Date`:
    #' * `NULL`, zero-length, or `NA` text entries become `NA`.
    #' * Text containing exactly one abbreviated month name (matched as a
    #'   case-insensitive substring, so "October" matches "oct") becomes a date in
    #'   the year of `ref_date`: day 1 if the text contains "early" (and not
    #'   "mid"), day 15 otherwise.
    #' * Text without a month name becomes `NA`.
    #' * Anything else is passed to `as.Date()`.
    #'
    #' NOTE(review): text entries always get the year of `ref_date`, so "mid Jan"
    #' read in November resolves to the January already past. Any day number
    #' written in the text (e.g. "Dec 1") is ignored. Confirm this is acceptable.
    #'
    #' @param bad_dates List or vector of raw review-date entries.
    #' @param ref_date `Date` supplying the year for text entries
    #'   (default: today).
    #' @return A `Date` vector with one element per entry of `bad_dates`.
    date_fixer <- function(bad_dates, ref_date = Sys.Date()) {
      month_keys <- tolower(month.abb)
      ref_year <- as.integer(format(ref_date, "%Y"))

      # Works in "days since 1970-01-01" so `vapply()` can enforce exactly one
      # numeric value per entry and empty input yields a zero-length result.
      fix_one <- function(d) {
        if (is.null(d) || length(d) == 0L) {
          return(NA_real_)
        }
        if (!is.character(d)) {
          return(as.numeric(as.Date(d)))
        }
        # A missing string would otherwise make the `if` conditions below NA.
        if (anyNA(d)) {
          return(NA_real_)
        }
        txt <- tolower(d)
        month_hits <- which(vapply(
          month_keys,
          function(m) grepl(m, txt, fixed = TRUE),
          logical(1)
        ))
        if (length(month_hits) == 0L) {
          return(NA_real_)
        }
        if (length(month_hits) > 1L) stop("multiple months found")
        # "mid" takes precedence over "early"; everything else defaults to 15.
        is_early <- grepl("early", txt, fixed = TRUE) &&
          !grepl("mid", txt, fixed = TRUE)
        day <- if (is_early) 1L else 15L
        as.numeric(as.Date(sprintf("%d-%02d-%02d", ref_year, month_hits, day)))
      }

      day_counts <- vapply(bad_dates, fix_one, numeric(1), USE.NAMES = FALSE)
      as.Date(day_counts, origin = "1970-01-01")
    }

    #' Read the "Faculty / Permanent Jobs" tab of the ecoevojobs sheet, skipping
    #' its first row (the request can take a few tries before it succeeds):
    ecoevo_df <- read_sheet(
      paste0("https://docs.google.com/spreadsheets/d/",
             "1cqTuSeLtH-Zw7X9ZtnhQxzw3r19Rya9nzdqRW9apTmY/edit#gid=865906911"),
      sheet = "Faculty / Permanent Jobs",
      skip = 1
    )
    #' Replace the raw `Review Date` entries with cleaned-up dates:
    ecoevo_df <- mutate(ecoevo_df, `Review Date` = date_fixer(`Review Date`))



    #' >>>>>>>>>>>>>>>>>>>>>>>>
    #' Updating the curated institutions list:
    #' 1. Run the expression below; it prints the sorted, de-duplicated
    #'    institutions of all postings whose `Timestamp` is after `curation_dt`.
    #' 2. Manually add any you're interested in to the `institutions.txt` file.
    #' 3. Re-run the command `insts <- read_lines("institutions.txt")`.
    ecoevo_df |>
      filter(Timestamp > curation_dt) |>
      pull(Institution) |>
      unique() |>
      sort()

    #'
    #' Narrow the table down to postings that satisfy all of:
    #' 1. Tenure-track appointment open to assistant professors (open rank counts)
    #' 2. Location appears in `locs`
    #' 3. Institution appears in `insts`
    #' 4. Review date is no more than a week in the past
    #'
    #' NOTE(review): `filter()` keeps only rows where the condition is `TRUE`, so
    #' postings whose `Review Date` is `NA` are dropped by step 4 as well.
    #'
    job_df <- ecoevo_df |>
      filter(Appointment == "Tenure Track") |>
      filter(Rank %in% c("Asst / Assoc Prof", "Asst or Assoc Prof", "Asst Prof",
                         "Open Rank", "Rank Open")) |>
      filter(Location %in% locs, Institution %in% insts) |>
      filter(`Review Date` > (Sys.Date() - 7)) |>
      select(Institution, `Subject Area`, `Review Date`, URL, Rank, Notes)


    #'
    #' Save the filtered table to a CSV stamped with today's date, so it can be
    #' searched manually by subject area.
    #'

    job_df |>
      write_csv(sprintf("filtered_jobs_%s.csv", Sys.Date()))