mdzhang created this gist on Oct 12, 2019.
"""Scrape data from NomadList into a local CSV.

Usage:
    python scrape.py --cities Austin 'Chiang Mai' Taipei Auckland Ubud \
        'Buenos Aires' 'Mexico City'
"""
import argparse
import logging
import os
import re
import string
import typing as T

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tabulate import tabulate

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def load_to_record(city: str) -> dict:
    """Fetch a single city page and return its scores as a flat dict."""
    # NomadList URLs use lowercase, hyphenated city names, e.g. /chiang-mai.
    clean_city = re.sub(r"\s+", "-", city.lower())
    url = f"https://nomadlist.com/{clean_city}"

    # The scores tab is rendered client-side, so drive a real browser
    # instead of issuing a plain HTTP request.
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        html_source = driver.page_source
    finally:
        # quit() (rather than close()) also shuts down the geckodriver process.
        driver.quit()

    soup = BeautifulSoup(html_source, "html.parser")
    nomad_scores = soup.find_all("div", attrs={"class": "tab-ranking"})[0]

    # The ranking tab is a two-column table of <td class="key"> /
    # <td class="value"> pairs; zip them into a single record.
    keys = [e.getText() for e in nomad_scores.find_all("td", attrs={"class": "key"})]
    values = [
        e.getText() for e in nomad_scores.find_all("td", attrs={"class": "value"})
    ]
    record = dict(zip(keys, values))
    record["city"] = city
    return record


def load_to_df(cities: T.List[str]) -> pd.DataFrame:
    """Scrape each city, then clean and rank the combined DataFrame."""

    def skip_fail(city):
        # Log and drop cities that fail to scrape instead of aborting the run.
        try:
            return load_to_record(city)
        except Exception:
            logger.exception(f"Failed to fetch city: {city}")
            return None

    records = list(filter(None, map(skip_fail, cities)))
    df = pd.DataFrame.from_records(records)

    def strip_emojis(s):
        # Column headers come back with emoji prefixes; keep printable ASCII.
        return "".join(filter(lambda x: x in string.printable, s)).strip()

    df.columns = [strip_emojis(col) for col in df.columns]

    top_cols = [
        "LGBT friendly",
        "Female friendly",
        "Safety",
        "Nomad Score",
        "Internet",
        "Walkability",
        "Traffic safety",
        "English speaking",
        "Fun",
        "Happiness",
        "Places to work from",
        "Cost",
        "city",
    ]
    # Copy so the column rewrites below don't trigger SettingWithCopyWarning.
    df2 = df[top_cols].copy()
    # pd.set_option('display.max_columns', None)

    def extract_cost(df):
        # "Cost: $1,500 / mo" -> 1500
        parts = df["Cost"].str.split(":", expand=True)
        return (
            parts[1]
            .str.extract(pat=r"\$([\d,]+) / mo")[0]
            .str.replace(",", "")
            .astype(int)
        )

    df2["Cost"] = extract_cost(df2)

    def extract_internet(df):
        # "Internet: 25Mbps (avg)" -> 25
        parts = df["Internet"].str.split(":", expand=True)
        return (
            parts[1]
            .str.extract(pat=r"([\d,]+)Mbps \(avg\)")[0]
            .str.replace(",", "")
            .astype(int)
        )

    df2["Internet"] = extract_internet(df2)

    def extract_nomad_score(df):
        # "4.5/5" -> 4.5
        return df["Nomad Score"].str.split("/", expand=True)[0].astype(float)

    df2["Nomad Score"] = extract_nomad_score(df2)

    # Remaining string columns hold ratings like "Good"; convert them to
    # ordered categoricals so sorting ranks Great > Good > Okay > Bad.
    cat_cols = df2.dtypes[df2.dtypes == "object"].index.drop("city")
    levels = ["Bad", "Okay", "Good", "Great"]

    df2[cat_cols] = df2[cat_cols].apply(
        lambda s: s.astype("category").cat.set_categories(levels, ordered=True)
    )

    return df2.sort_values(
        by=["LGBT friendly", "Female friendly", "Walkability", "Safety"],
        ascending=False,
    )
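# A minimal sketch (not part of the original gist) of the ordered-categorical
# trick used in load_to_df, on made-up ratings rather than real NomadList
# data. Once the levels are declared ordered, sort_values() ranks them
# semantically instead of alphabetically ("Good" would otherwise sort
# below "Okay"):
#
#   s = pd.Series(["Good", "Bad", "Great"]).astype("category")
#   s = s.cat.set_categories(["Bad", "Okay", "Good", "Great"], ordered=True)
#   s.sort_values(ascending=False)  # -> Great, Good, Bad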
def get_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Fetch data from NomadList and write as CSV"
    )
    parser.add_argument("--cities", nargs="+", help="Cities to fetch data on")
    return parser


def main(cities: T.List[str]) -> None:
    cache_file = "nomadlist.csv"

    if not os.path.exists(cache_file):
        logger.info(f"Fetching contents for the first time into '{cache_file}'")
        df = load_to_df(cities)
        df.to_csv(cache_file, index=False)
    else:
        logger.info(f"Reusing local '{cache_file}'")
        df = pd.read_csv(cache_file)

    print(tabulate(df, headers="keys", tablefmt="psql"))


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(cities=args.cities)
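# Example invocation, assuming Firefox plus a geckodriver binary on PATH and
# `pip install pandas beautifulsoup4 selenium tabulate` (dependency names
# inferred from the imports above; the gist pins no versions):
#
#   python scrape.py --cities Austin 'Chiang Mai' Taipei
#
# Note the nomadlist.csv cache: delete it to force a fresh scrape, since
# main() otherwise reuses the local copy and ignores --cities.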