Created
January 29, 2019 21:07
-
-
Save pshapiro/a86dc340f57c38fc22d0545ddec1fc9e to your computer and use it in GitHub Desktop.
Jupyter Notebook that takes the outlinks from a Screaming Frog crawl as input, grabs PA & DA from the Moz API, and uses a WHOIS API to determine domain availability.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Expired Domain Finder" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Change the `client` variable to include your Moz API *Access ID* and *Secret Key*. You'll need access to the Moz API." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import json\n", | |
| "import os\n", | |
| "import time\n", | |
| "from urllib.parse import urlparse\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "import requests\n", | |
| "from mozscape import Mozscape\n", | |
| "\n", | |
| "def divide_chunks(l, n):\n", | |
| "    \"\"\"Yield successive slices of `l`, each holding at most `n` items.\n", | |
| "\n", | |
| "    The final slice is shorter when `len(l)` is not a multiple of `n`.\n", | |
| "    \"\"\"\n", | |
| "    for start in range(0, len(l), n):\n", | |
| "        yield l[start:start + n]\n", | |
| " \n", | |
| "# Credentials are read from the environment when set, so real keys need not be saved in the notebook;\n", | |
| "# otherwise replace the placeholder strings below.\n", | |
| "client = Mozscape(os.environ.get('MOZ_ACCESS_ID', 'my_access_id'),\n", | |
| "                  os.environ.get('MOZ_SECRET_KEY', 'my_secret_key'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "The `csv` variable holds an *All Outlinks* report exported from [Screaming Frog](https://www.screamingfrog.co.uk/seo-spider/)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "csv = pd.read_csv('./all_outlinks.csv', skiprows=1)\n", | |
| "\n", | |
| "# Keep hyperlink rows only; rows with no destination cannot be matched or crawled.\n", | |
| "links = csv[csv['Type'] == 'AHREF'].dropna(subset=['Destination'])\n", | |
| "# Drop the excluded destinations from the AHREF subset. Filtering `csv` again here\n", | |
| "# would silently throw away the Type filter applied on the previous line.\n", | |
| "links = links[~links['Destination'].str.match('https?://boardgamegeek.com/.*|https?://rpggeek.com/.*|https?://boardgamegeekstore.com/.*|https?://.*.\\.geekdo-.*.com/.*|https?://videogamegeek.com/.*|https?://.*\\.amazon-.*.com.*')]\n", | |
| "\n", | |
| "# Reduce every destination to its scheme + host part.\n", | |
| "Domains = links['Destination'].replace(to_replace=\"(.*://)?([^/?]+).*\", value=r\"\\1\\2\", regex=True)\n", | |
| "\n", | |
| "# Unique hosts, grouped into batches of 5 for the metrics lookup.\n", | |
| "x = list(divide_chunks(Domains.unique().tolist(), 5))\n", | |
| "\n", | |
| "df = pd.DataFrame(columns=['pda','upa','url','status'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "`headers` is set up to spoof the Googlebot user agent so that servers do not block the status code checks. The loop sleeps for 5 seconds after every 5 domains checked with the Moz API." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "{'upa': 20, 'pda': 13, 'url': 'http://www.qmlogistics.com', 'status': 200}\n", | |
| "{'upa': 100, 'pda': 100, 'url': 'https://www.youtube.com', 'status': 200}\n", | |
| "{'upa': 37, 'pda': 73, 'url': 'https://moedaseco.lojaintegrada.com.br', 'status': 200}\n", | |
| "{'upa': 22, 'pda': 19, 'url': 'https://www.eggertspiele.com', 'status': 200}\n", | |
| "{'upa': 80, 'pda': 94, 'url': 'https://www.amazon.co.uk', 'status': 200}\n", | |
| "{'upa': 30, 'pda': 23, 'url': 'https://boardgameprices.co.uk', 'status': 200}\n", | |
| "{'upa': 22, 'pda': 22, 'url': 'http://firestormcards.co.uk', 'status': 200}\n", | |
| "{'upa': 65, 'pda': 83, 'url': 'http://www.boardgamegeek.com', 'status': 200}\n", | |
| "{'upa': 56, 'pda': 68, 'url': 'https://challonge.com', 'status': 403}\n", | |
| "{'upa': 31, 'pda': 28, 'url': 'https://www.gamenerdz.com', 'status': 200}\n", | |
| "{'upa': 40, 'pda': 36, 'url': 'https://www.thebrokentoken.com', 'status': 200}\n", | |
| "{'upa': 50, 'pda': 49, 'url': 'https://www.plaidhatgames.com', 'status': 200}\n", | |
| "{'upa': 1, 'pda': 0, 'url': 'http://www.moedaseco.com.br', 'status': 200}\n", | |
| "{'upa': 41, 'pda': 43, 'url': 'https://www.maydaygames.com', 'status': 200}\n", | |
| "{'upa': 37, 'pda': 35, 'url': 'http://www.summoner.nl', 'status': 200}\n", | |
| "{'upa': 66, 'pda': 94, 'url': 'https://cdn.shopify.com', 'status': 403}\n", | |
| "{'upa': 59, 'pda': 71, 'url': 'https://www.fantasyflightgames.com', 'status': 403}\n", | |
| "{'upa': 59, 'pda': 92, 'url': 'https://media.giphy.com', 'status': 403}\n", | |
| "{'upa': 63, 'pda': 76, 'url': 'https://memegenerator.net', 'status': 200}\n", | |
| "{'upa': 32, 'pda': 30, 'url': 'https://www.planbgames.com', 'status': 200}\n", | |
| "{'upa': 42, 'pda': 37, 'url': 'https://strongholdgames.com', 'status': 200}\n", | |
| "{'upa': 52, 'pda': 58, 'url': 'https://www.yourlogicalfallacyis.com', 'status': 200}\n", | |
| "{'upa': 36, 'pda': 33, 'url': 'http://www.bordspelmania.eu', 'status': 200}\n", | |
| "{'upa': 30, 'pda': 28, 'url': 'http://bordspeler.nl', 'status': 200}\n", | |
| "{'upa': 100, 'pda': 100, 'url': 'https://twitter.com', 'status': 200}\n", | |
| "{'upa': 88, 'pda': 97, 'url': 'https://en.wikipedia.org', 'status': 200}\n", | |
| "{'upa': 52, 'pda': 54, 'url': 'http://www.coolstuffinc.com', 'status': 200}\n", | |
| "{'upa': 65, 'pda': 92, 'url': 'https://i.ytimg.com', 'status': 404}\n", | |
| "{'upa': 91, 'pda': 97, 'url': 'https://www.amazon.com', 'status': 200}\n", | |
| "{'upa': 71, 'pda': 91, 'url': 'https://www.amazon.ca', 'status': 200}\n", | |
| "{'upa': 27, 'pda': 26, 'url': 'http://www.apttogame.com', 'status': 200}\n", | |
| "{'upa': 40, 'pda': 34, 'url': 'http://www.eggertspiele.de', 'status': 200}\n", | |
| "{'upa': 65, 'pda': 93, 'url': 'https://s-media-cache-ak0.pinimg.com', 'status': 403}\n", | |
| "{'upa': 51, 'pda': 70, 'url': 'https://tshaonline.org', 'status': 200}\n", | |
| "{'upa': 85, 'pda': 95, 'url': 'https://www.etsy.com', 'status': 200}\n", | |
| "{'upa': 24, 'pda': 20, 'url': 'https://boardgameinnovation.com', 'status': 200}\n", | |
| "{'upa': 37, 'pda': 36, 'url': 'http://www.boardgamebliss.com', 'status': 200}\n", | |
| "{'upa': 37, 'pda': 42, 'url': 'http://frpgames.com', 'status': 200}\n", | |
| "{'upa': 50, 'pda': 52, 'url': 'http://www.philibertnet.com', 'status': 200}\n", | |
| "{'upa': 39, 'pda': 34, 'url': 'http://www.thirstymeeples.co.uk', 'status': 200}\n", | |
| "{'upa': 54, 'pda': 57, 'url': 'http://www.artscow.com', 'status': 200}\n", | |
| "{'upa': 81, 'pda': 97, 'url': 'https://itunes.apple.com', 'status': 200}\n", | |
| "{'upa': 6, 'pda': 7, 'url': 'http://boardgames.bplaced.net', 'status': 200}\n", | |
| "{'upa': 51, 'pda': 95, 'url': 'https://opinionatedgamers.files.wordpress.com', 'status': 200}\n", | |
| "{'upa': 7, 'pda': 9, 'url': 'http://eggertspiele.bplaced.net', 'status': 403}\n", | |
| "{'upa': 37, 'pda': 37, 'url': 'http://www.strongholdgames.com', 'status': 200}\n", | |
| "{'upa': 62, 'pda': 93, 'url': 'https://i.pinimg.com', 'status': 403}\n", | |
| "{'upa': 26, 'pda': 20, 'url': 'http://www.athenagames.com', 'status': 200}\n", | |
| "{'upa': 28, 'pda': 23, 'url': 'http://boardgamesinsider.com', 'status': 200}\n", | |
| "{'upa': 37, 'pda': 33, 'url': 'http://store.401games.ca', 'status': 200}\n", | |
| "{'upa': 41, 'pda': 46, 'url': 'http://www.boardgamequest.com', 'status': 200}\n", | |
| "{'upa': 33, 'pda': 35, 'url': 'http://brettspielbox.de', 'status': 200}\n", | |
| "{'upa': 25, 'pda': 26, 'url': 'http://www.brettspiel-news.de', 'status': 200}\n", | |
| "{'upa': 68, 'pda': 92, 'url': 'https://pbs.twimg.com', 'status': 400}\n", | |
| "{'upa': 25, 'pda': 36, 'url': 'https://www.cpforbes.net', 'status': 403}\n", | |
| "{'upa': 85, 'pda': 97, 'url': 'http://goo.gl', 'status': 200}\n", | |
| "{'upa': 65, 'pda': 83, 'url': 'https://www.boardgamegeek.com', 'status': 200}\n", | |
| "{'upa': 22, 'pda': 17, 'url': 'http://www.argfx.at', 'status': 200}\n", | |
| "{'upa': 43, 'pda': 42, 'url': 'https://www.blend4web.com', 'status': 200}\n", | |
| "{'upa': 37, 'pda': 32, 'url': 'http://www.plato-magazine.com', 'status': 200}\n", | |
| "{'upa': 35, 'pda': 37, 'url': 'http://www.vindjeu.eu', 'status': 200}\n", | |
| "{'upa': 52, 'pda': 54, 'url': 'https://www.coolstuffinc.com', 'status': 200}\n", | |
| "{'upa': 41, 'pda': 37, 'url': 'http://www.cardhaus.com', 'status': 200}\n", | |
| "{'upa': 53, 'pda': 56, 'url': 'http://www.webhallen.com', 'status': 200}\n", | |
| "{'upa': 48, 'pda': 53, 'url': 'http://www.cowcow.com', 'status': 200}\n", | |
| "{'upa': 67, 'pda': 80, 'url': 'https://www.rotary.org', 'status': 200}\n", | |
| "{'upa': 15, 'pda': 12, 'url': 'http://controlledareagaming.com', 'status': 200}\n", | |
| "{'upa': 75, 'pda': 92, 'url': 'https://www.twitch.tv', 'status': 200}\n", | |
| "{'upa': 78, 'pda': 93, 'url': 'https://www.amazon.de', 'status': 200}\n", | |
| "{'upa': 68, 'pda': 83, 'url': 'http://www.thingiverse.com', 'status': 200}\n", | |
| "{'upa': 23, 'pda': 20, 'url': 'http://www.boardgameinnovation.com', 'status': 200}\n", | |
| "{'upa': 67, 'pda': 95, 'url': 'https://m.imgur.com', 'status': 200}\n", | |
| "{'upa': 82, 'pda': 96, 'url': 'https://play.google.com', 'status': 200}\n", | |
| "{'upa': 1, 'pda': 0, 'url': 'http://concordiascore.azurewebsites.net', 'status': -1}\n", | |
| "{'upa': 48, 'pda': 95, 'url': 'https://thevirginiantv.files.wordpress.com', 'status': 200}\n", | |
| "{'upa': 51, 'pda': 50, 'url': 'http://www.miniaturemarket.com', 'status': 200}\n", | |
| "{'upa': 31, 'pda': 28, 'url': 'http://www.greatboardgames.ca', 'status': 200}\n", | |
| "{'upa': 89, 'pda': 98, 'url': 'https://www.reddit.com', 'status': 429}\n", | |
| "{'upa': 51, 'pda': 52, 'url': 'http://www.pegasus.de', 'status': 200}\n", | |
| "{'upa': 30, 'pda': 25, 'url': 'https://www.topshelfgamer.com', 'status': 200}\n", | |
| "{'upa': 19, 'pda': 16, 'url': 'http://fatcatgaming.co.uk', 'status': 200}\n", | |
| "{'upa': 40, 'pda': 36, 'url': 'http://www.thebrokentoken.com', 'status': 200}\n", | |
| "{'upa': 41, 'pda': 38, 'url': 'http://www.meeplesource.com', 'status': 200}\n", | |
| "{'upa': 79, 'pda': 93, 'url': 'https://www.kickstarter.com', 'status': 200}\n", | |
| "{'upa': 43, 'pda': 39, 'url': 'http://www.eaglegames.net', 'status': 200}\n", | |
| "{'upa': 81, 'pda': 96, 'url': 'https://youtu.be', 'status': 200}\n", | |
| "{'upa': 37, 'pda': 36, 'url': 'https://www.boardgamebliss.com', 'status': 200}\n", | |
| "{'upa': 45, 'pda': 44, 'url': 'http://1856.com', 'status': 200}\n", | |
| "{'upa': 23, 'pda': 18, 'url': 'http://www.unhalfbricking.com', 'status': -1}\n", | |
| "{'upa': 27, 'pda': 18, 'url': 'http://www.boardgamesearch.com.au', 'status': 200}\n", | |
| "{'upa': 46, 'pda': 74, 'url': 'https://m.media-amazon.com', 'status': 400}\n", | |
| "{'upa': 58, 'pda': 91, 'url': 'https://images-na.ssl-images-amazon.com', 'status': 400}\n", | |
| "{'upa': 30, 'pda': 34, 'url': 'http://eggertspiele.de', 'status': 200}\n", | |
| "{'upa': 82, 'pda': 95, 'url': 'https://imgur.com', 'status': 200}\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "headers = {'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}\n", | |
| "\n", | |
| "# Rows are collected in a list and turned into a DataFrame once at the end:\n", | |
| "# DataFrame.append was removed in pandas 2.0, and rebuilding `df` here makes the cell safe to re-run.\n", | |
| "records = []\n", | |
| "\n", | |
| "for vals in x:\n", | |
| "    da_pa = client.urlMetrics(vals, Mozscape.UMCols.domainAuthority | Mozscape.UMCols.pageAuthority)\n", | |
| "    # Assumes the API returns one metrics dict per submitted URL, in the same order.\n", | |
| "    for url, y in zip(vals, da_pa):\n", | |
| "        y['url'] = url\n", | |
| "        try:\n", | |
| "            # The timeout keeps one stalled host from hanging the whole run.\n", | |
| "            r = requests.get(url, headers=headers, timeout=10)\n", | |
| "            y['status'] = r.status_code\n", | |
| "        except requests.exceptions.RequestException:\n", | |
| "            # -1 marks a URL that could not be fetched at all\n", | |
| "            # (connection failure, timeout, redirect loop, unsupported scheme, ...).\n", | |
| "            y['status'] = -1\n", | |
| "        records.append(y)\n", | |
| "        print(y)\n", | |
| "    # Pause after every batch of domains sent to the Moz API.\n", | |
| "    time.sleep(5)\n", | |
| "\n", | |
| "df = pd.DataFrame(records, columns=['pda', 'upa', 'url', 'status'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Collect the domains worth a WHOIS lookup: those with an error status (`status_code_threshold`, 400 and above) and a Domain Authority of at least `da_threshold`." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "status_code_threshold = 400\n", | |
| "da_threshold = 25\n", | |
| "# The status-check loop stores -1 when a host could not be reached. Those are the most\n", | |
| "# likely expired domains, and a plain `>= 400` test would silently exclude them.\n", | |
| "unreachable_status = -1\n", | |
| "\n", | |
| "is_error = (df['status'] >= status_code_threshold) | (df['status'] == unreachable_status)\n", | |
| "has_authority = df['pda'] >= da_threshold\n", | |
| "error_urls = df[is_error & has_authority]['url'].tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Set the `whois_api_key` variable to your [Whois XML API](https://main.whoisxmlapi.com/) key. 500 credits are available for free." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "amazon.co.uk status: UNAVAILABLE\n", | |
| "shopify.com status: UNAVAILABLE\n", | |
| "giphy.com status: UNAVAILABLE\n", | |
| "coolstuffinc.com status: UNAVAILABLE\n", | |
| "ytimg.com status: UNAVAILABLE\n", | |
| "amazon.com status: UNAVAILABLE\n", | |
| "pinimg.com status: AVAILABLE\n", | |
| "pinimg.com status: AVAILABLE\n", | |
| "twimg.com status: UNAVAILABLE\n", | |
| "coolstuffinc.com status: UNAVAILABLE\n", | |
| "challonge.com status: AVAILABLE\n", | |
| "shopify.com status: UNAVAILABLE\n", | |
| "fantasyflightgames.com status: AVAILABLE\n", | |
| "giphy.com status: UNAVAILABLE\n", | |
| "ytimg.com status: UNAVAILABLE\n", | |
| "pinimg.com status: AVAILABLE\n", | |
| "pinimg.com status: UNAVAILABLE\n", | |
| "twimg.com status: UNAVAILABLE\n", | |
| "cpforbes.net status: UNAVAILABLE\n", | |
| "reddit.com status: UNAVAILABLE\n", | |
| "media-amazon.com status: UNAVAILABLE\n", | |
| "ssl-images-amazon.com status: UNAVAILABLE\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Read the key from the environment when set; otherwise replace the placeholder.\n", | |
| "whois_api_key = os.environ.get('WHOIS_API_KEY', \"your_key\")\n", | |
| "whois_endpoint = \"https://www.whoisxmlapi.com/whoisserver/WhoisService\"\n", | |
| "\n", | |
| "# Look each host up only once: http/https variants of the same host collapse to one entry.\n", | |
| "hosts = []\n", | |
| "for url in error_urls:\n", | |
| "    host = urlparse(url).hostname or url\n", | |
| "    if host not in hosts:\n", | |
| "        hosts.append(host)\n", | |
| "\n", | |
| "# The loop variable is deliberately not `x`, which holds the domain batches built earlier.\n", | |
| "for host in hosts:\n", | |
| "    query = {\n", | |
| "        'apiKey': whois_api_key,\n", | |
| "        'outputFormat': 'JSON',\n", | |
| "        'cmd': 'GET_DN_AVAILABILITY',\n", | |
| "        'domainName': host,\n", | |
| "    }\n", | |
| "    try:\n", | |
| "        # `params` URL-encodes every value instead of splicing raw text into the query string.\n", | |
| "        r = requests.get(whois_endpoint, params=query, timeout=30)\n", | |
| "        info = r.json()['DomainInfo']\n", | |
| "    except (requests.exceptions.RequestException, ValueError, KeyError):\n", | |
| "        # Network failure, non-JSON body, or a payload without 'DomainInfo': report and move on.\n", | |
| "        print(host + \" status: LOOKUP FAILED\")\n", | |
| "        continue\n", | |
| "    print(info['domainName'] + \" status: \" + info['domainAvailability'])" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment