
@mdzhang
Created October 12, 2019 19:24
nomad.py
    """Scrape data from NomadList into local CSV.
    Usage:
    python scrape.py --cities Austin 'Chiang Mai' Taipei Auckland Ubud 'Buenos Aires' 'Mexico City'
    """
import argparse
import logging
import os
import re
import string
import typing as T

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tabulate import tabulate

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def load_to_record(city):
    """Fetch the NomadList page for ``city`` and return its scores as a dict."""
    clean_city = re.sub(r"\s+", "-", city.lower())
    url = f"https://nomadlist.com/{clean_city}"

    # Load the page in a real Firefox instance via Selenium so client-side-rendered
    # content is present in the HTML we parse.
    driver = webdriver.Firefox()
    driver.get(url)
    html_source = driver.page_source
    driver.quit()  # quit() ends the session so no stray driver process is left per city

    soup = BeautifulSoup(html_source, "html.parser")
    nomad_scores = soup.find_all("div", attrs={"class": "tab-ranking"})[0]
    keys = [e.getText() for e in nomad_scores.find_all("td", attrs={"class": "key"})]
    values = [e.getText() for e in nomad_scores.find_all("td", attrs={"class": "value"})]

    record = dict(zip(keys, values))
    record["city"] = city
    return record


def load_to_df(cities):
    """Scrape each city and assemble the results into a cleaned DataFrame."""

    def skip_fail(city):
        try:
            return load_to_record(city)
        except Exception as exc:
            logger.exception(f"Failed to fetch city: {city}, {exc}")
            return None

    # Drop cities that failed to scrape.
    records = list(filter(None, map(skip_fail, cities)))

    df = pd.DataFrame.from_dict(records)

    def strip_emojis(s):
        # NomadList labels carry emoji prefixes; keep only printable ASCII.
        return "".join(filter(lambda x: x in string.printable, s)).strip()

    df.columns = [strip_emojis(col) for col in df.columns]

    top_cols = [
        "LGBT friendly",
        "Female friendly",
        "Safety",
        "Nomad Score",
        "Internet",
        "Walkability",
        "Traffic safety",
        "English speaking",
        "Fun",
        "Happiness",
        "Places to work from",
        "Cost",
        "city",
    ]

    # .copy() avoids pandas SettingWithCopyWarning when columns are mutated below.
    df2 = df[top_cols].copy()
    # pd.set_option('display.max_columns', None)

    def extract_cost(df):
        # "Cost" holds text like "...: $1,234 / mo"; pull out the monthly dollar amount.
        parts = df["Cost"].str.split(":", expand=True)
        return (
            parts[1]
            .str.extract(pat=r"\$([\d,]+) \/ mo")[0]
            .str.replace(",", "")
            .astype(int)
        )

    df2["Cost"] = extract_cost(df2)

    def extract_internet(df):
        # "Internet" holds text like "...: 25Mbps (avg)"; pull out the integer speed.
        parts = df["Internet"].str.split(":", expand=True)
        return (
            parts[1]
            .str.extract(pat=r"([\d,]+)Mbps \(avg\)")[0]
            .str.replace(",", "")
            .astype(int)
        )

    df2["Internet"] = extract_internet(df2)

    def extract_nomad_score(df):
        # "Nomad Score" holds text like "4.5/5"; keep the numerator as a float.
        return df["Nomad Score"].str.split("/", expand=True)[0].astype(float)

    df2["Nomad Score"] = extract_nomad_score(df2)

    # Remaining object columns hold ordinal ratings; make them ordered categoricals
    # so sorting respects the Bad < Okay < Good < Great ordering.
    cat_cols = [c for c in df2.dtypes[df2.dtypes == "object"].index if c != "city"]
    levels = ["Bad", "Okay", "Good", "Great"]

    df2[cat_cols] = df2[cat_cols].apply(
        lambda s: s.astype("category").cat.set_categories(levels, ordered=True)
    )

    return df2.sort_values(
        by=["LGBT friendly", "Female friendly", "Walkability", "Safety"],
        ascending=False,
    )


def get_parser():
    parser = argparse.ArgumentParser(
        description="Fetch data from NomadList and write as CSV"
    )
    parser.add_argument("--cities", nargs="+", help="Cities to fetch data on")
    return parser


def main(cities: T.List[str]):
    cache_file = "nomadlist.csv"

    if not os.path.exists(cache_file):
        logger.info(f"Fetching data for the first time; caching to '{cache_file}'")
        df = load_to_df(cities)
        df.to_csv(cache_file, index=False)
    else:
        logger.info(f"Reusing local '{cache_file}'")
        df = pd.read_csv(cache_file)

    print(tabulate(df, headers="keys", tablefmt="psql"))


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(cities=args.cities)