from bs4 import BeautifulSoup
from urllib.parse import urlparse
import argparse
import datetime
import pprint
import requests
import sys

from scripts.webmention.utils import send_webmention


def get_post_text(post):
    """Return the HTML text of an HN post.

    Prefers the story text, falls back to the comment text, and returns an
    empty string when neither field is present (or both are empty/None).
    """
    if post.get("story_text"):
        return post["story_text"]
    if post.get("comment_text"):
        return post["comment_text"]
    return ""


def main(domain, since_days=7):
    """Send webmentions for recent HN stories that link to *domain*.

    Queries the Algolia HN search API for stories created within the last
    *since_days* days that mention *domain*, and for each match sends a
    webmention from the HN item page to the linked URL on *domain*.
    Exits the process with status 1 on any request failure.
    """
    # Equivalent UI query, for reference:
    # https://hn.algolia.com/?dateRange=pastWeek&page=0&prefix=false&query=jeremykun.com&sort=byDate&type=story
    #
    # Algolia's created_at_i filter is a Unix epoch; use an aware UTC "now"
    # so the cutoff is correct regardless of the machine's local timezone.
    cutoff_epoch = (
        int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        - since_days * 24 * 60 * 60
    )
    search_url = (
        "https://hn.algolia.com/api/v1/search"
        f"?query={domain}&tags=story&hitsPerPage=20"
        f"&numericFilters=created_at_i%3E{cutoff_epoch}"
    )
    try:
        # timeout: don't let a hung connection stall the script forever;
        # raise_for_status: fail loudly on HTTP errors instead of a later
        # KeyError when indexing the JSON body.
        r = requests.get(search_url, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        sys.exit(1)

    response = r.json()
    pprint.pp(response)
    num_hits = response["nbHits"]
    num_pages = response["nbPages"]
    print(f"Found {num_hits} posts across {num_pages} paginated search pages.")

    for page in range(num_pages):
        print(f"Querying page {page}")
        try:
            r = requests.get(f"{search_url}&page={page}", timeout=30)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)

        for post in r.json()["hits"]:
            # created_at is an ISO-8601 UTC timestamp (trailing "Z"); parse
            # it as timezone-aware UTC so the age comparison below is correct
            # on non-UTC machines. (The original compared a UTC-derived naive
            # datetime against local naive now(), skewing the age by the
            # local UTC offset.)
            if "created_at" in post:
                created_at = datetime.datetime.strptime(
                    post["created_at"], "%Y-%m-%dT%H:%M:%SZ"
                ).replace(tzinfo=datetime.timezone.utc)
            else:
                created_at = datetime.datetime.now(datetime.timezone.utc)
            now = datetime.datetime.now(datetime.timezone.utc)
            if (now - created_at).days > since_days:
                # we already manually handled this webmention with the initial
                # script run
                print(
                    f"Skipping post because its publication date ({created_at}) "
                    f"is older than the threshold of {since_days} days since "
                    f"today ({now})."
                )
                continue

            post_url = "https://news.ycombinator.com/item?id=" + str(post["objectID"])
            post_http_url = post.get("url")
            print(f"Post URL: {post_http_url}")
            # use 'domain in' because it may be www.jeremykun.com or jeremykun.com
            if post_http_url is not None and domain in urlparse(post_http_url).netloc:
                send_webmention(post_url, post_http_url)
                continue
            # Guard against urlparse(None), which raises for "Ask HN"-style
            # posts that have no external URL.
            parsed = urlparse(post_http_url).netloc if post_http_url else None
            print(f"doesn't match {domain} netloc was: {parsed}")

            # No (matching) external URL: scan the post body itself for links
            # back to our domain.
            story_text = get_post_text(post)
            content = BeautifulSoup(story_text, "html.parser")
            for link in content.find_all("a"):
                href = link.get("href")
                if href is None:
                    continue
                if urlparse(href).netloc == domain:
                    send_webmention(post_url, href)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--domain")
    parser.add_argument("-s", "--since_days", type=int, default=7)
    args = parser.parse_args()
    main(args.domain, args.since_days)