mdzhang created this gist on Oct 12, 2019.
"""Scrape data from NomadList into a local CSV.

Usage:
    python scrape.py --cities Austin 'Chiang Mai' Taipei Auckland Ubud \
        'Buenos Aires' 'Mexico City'
"""
import argparse
import logging
import os
import re
import string
import typing as T

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tabulate import tabulate

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def load_to_record(city: str) -> dict:
    """Fetch a single city page and return its scores as a flat dict."""
    # NomadList URLs use lowercase, hyphenated city names, e.g. /chiang-mai.
    clean_city = re.sub(r"\s+", "-", city.lower())
    url = f"https://nomadlist.com/{clean_city}"

    # The scores tab is rendered client-side, so drive a real browser
    # instead of issuing a plain HTTP request.
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        html_source = driver.page_source
    finally:
        # quit() (rather than close()) also shuts down the geckodriver process.
        driver.quit()

    soup = BeautifulSoup(html_source, "html.parser")
    nomad_scores = soup.find_all("div", attrs={"class": "tab-ranking"})[0]

    # The ranking tab is a two-column table of <td class="key"> /
    # <td class="value"> pairs; zip them into a single record.
    keys = [e.getText() for e in nomad_scores.find_all("td", attrs={"class": "key"})]
    values = [
        e.getText() for e in nomad_scores.find_all("td", attrs={"class": "value"})
    ]
    record = dict(zip(keys, values))
    record["city"] = city
    return record


def load_to_df(cities: T.List[str]) -> pd.DataFrame:
    """Scrape each city, then clean and rank the combined DataFrame."""

    def skip_fail(city):
        # Log and drop cities that fail to scrape instead of aborting the run.
        try:
            return load_to_record(city)
        except Exception:
            logger.exception(f"Failed to fetch city: {city}")
            return None

    records = list(filter(None, map(skip_fail, cities)))
    df = pd.DataFrame.from_records(records)

    def strip_emojis(s):
        # Column headers come back with emoji prefixes; keep printable ASCII.
        return "".join(filter(lambda x: x in string.printable, s)).strip()

    df.columns = [strip_emojis(col) for col in df.columns]

    top_cols = [
        "LGBT friendly",
        "Female friendly",
        "Safety",
        "Nomad Score",
        "Internet",
        "Walkability",
        "Traffic safety",
        "English speaking",
        "Fun",
        "Happiness",
        "Places to work from",
        "Cost",
        "city",
    ]
    # Copy so the column rewrites below don't trigger SettingWithCopyWarning.
    df2 = df[top_cols].copy()
    # pd.set_option('display.max_columns', None)

    def extract_cost(df):
        # "Cost: $1,500 / mo" -> 1500
        parts = df["Cost"].str.split(":", expand=True)
        return (
            parts[1]
            .str.extract(pat=r"\$([\d,]+) / mo")[0]
            .str.replace(",", "")
            .astype(int)
        )

    df2["Cost"] = extract_cost(df2)

    def extract_internet(df):
        # "Internet: 25Mbps (avg)" -> 25
        parts = df["Internet"].str.split(":", expand=True)
        return (
            parts[1]
            .str.extract(pat=r"([\d,]+)Mbps \(avg\)")[0]
            .str.replace(",", "")
            .astype(int)
        )

    df2["Internet"] = extract_internet(df2)

    def extract_nomad_score(df):
        # "4.5/5" -> 4.5
        return df["Nomad Score"].str.split("/", expand=True)[0].astype(float)

    df2["Nomad Score"] = extract_nomad_score(df2)

    # Remaining string columns hold ratings like "Good"; convert them to
    # ordered categoricals so sorting ranks Great > Good > Okay > Bad.
    cat_cols = df2.dtypes[df2.dtypes == "object"].index.drop("city")
    levels = ["Bad", "Okay", "Good", "Great"]

    df2[cat_cols] = df2[cat_cols].apply(
        lambda s: s.astype("category").cat.set_categories(levels, ordered=True)
    )

    return df2.sort_values(
        by=["LGBT friendly", "Female friendly", "Walkability", "Safety"],
        ascending=False,
    )
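# A minimal sketch (not part of the original gist) of the ordered-categorical
# trick used in load_to_df, on made-up ratings rather than real NomadList
# data. Once the levels are declared ordered, sort_values() ranks them
# semantically instead of alphabetically ("Good" would otherwise sort
# below "Okay"):
#
#   s = pd.Series(["Good", "Bad", "Great"]).astype("category")
#   s = s.cat.set_categories(["Bad", "Okay", "Good", "Great"], ordered=True)
#   s.sort_values(ascending=False)  # -> Great, Good, Bad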
def get_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Fetch data from NomadList and write as CSV"
    )
    parser.add_argument("--cities", nargs="+", help="Cities to fetch data on")
    return parser


def main(cities: T.List[str]) -> None:
    cache_file = "nomadlist.csv"

    if not os.path.exists(cache_file):
        logger.info(f"Fetching contents for the first time into '{cache_file}'")
        df = load_to_df(cities)
        df.to_csv(cache_file, index=False)
    else:
        logger.info(f"Reusing local '{cache_file}'")
        df = pd.read_csv(cache_file)

    print(tabulate(df, headers="keys", tablefmt="psql"))


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(cities=args.cities)
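# Example invocation, assuming Firefox plus a geckodriver binary on PATH and
# `pip install pandas beautifulsoup4 selenium tabulate` (dependency names
# inferred from the imports above; the gist pins no versions):
#
#   python scrape.py --cities Austin 'Chiang Mai' Taipei
#
# Note the nomadlist.csv cache: delete it to force a fresh scrape, since
# main() otherwise reuses the local copy and ignores --cities.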