Skip to content

Instantly share code, notes, and snippets.

View pguther's full-sized avatar

Philip Guther pguther

  • San Francisco, CA
View GitHub Profile
@pguther
pguther / remove_url_duplicates
Last active September 29, 2016 17:46
Simple Python script to remove duplicate site names from a CSV file containing a subdomain and URL path on each row
import argparse
import csv
from collections import OrderedDict
import re
# Command-line interface: the script takes exactly one positional argument,
# the path of the CSV file to deduplicate.
# NOTE(review): "URls" in the help text is a typo for "URLs" — user-visible
# string, left unchanged here; confirm before fixing.
parser = argparse.ArgumentParser(description='Remove duplicate URls from csv of pages mentioning C8.')
parser.add_argument('file', help='The CSV file to read')
# Parses sys.argv at import time; args.file holds the CSV path.
args = parser.parse_args()
## Velocity (Cascade CMS style) fragment: extract display fields from the
## page that is calling this index block. Assumes $contentRoot exposes a
## /system-index-block/calling-page/system-page node -- TODO confirm schema.
## Get XML for current page
#set($article = $_XPathTool.selectSingleNode($contentRoot, "/system-index-block/calling-page/system-page"))
## Get the title of the page
#set($title = $article.getChild("title"))
## Page path (plain text) used as the link target
#set($link = $article.getChild("path").text)
## Teaser/summary element for the page
#set($summary = $article.getChild("summary"))
## Lead image path from the page's structured data; presumably fails if the
## data structure lacks a lead-image/image/path chain -- verify against caller
#set($image = $article.getChild("system-data-structure").getChild("lead-image").getChild("image").getChild("path").text)
import bs4
from bs4 import BeautifulSoup
from unidecode import unidecode
import urllib
import re
import datetime
import requests
import argparse
# Matches long-form dates such as "September 29, 2016": a month word,
# optional whitespace, a 1-2 digit day, a comma, optional whitespace, and a
# 4-digit year. Fix: dropped the redundant backslash before the comma
# (`\,` and `,` match identically; the escape is a lint-flagged no-op).
# NOTE(review): \s* permits zero spaces, so "September29, 2016" also
# matches -- confirm whether that is intentional.
date_regex = re.compile(r"[A-Za-z]+\s*\d{1,2},\s*\d{4}")
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<!-- BLOCK: uniform campus head -->
<meta charset="UTF-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, user-scalable=yes, initial-scale=1.0, minimum-scale=1.0, maximum-scale=2.0" name="viewport"/>
<link href="../favicon.ico" rel="shortcut icon"/>
<link href="../apple-touch-icon.png" rel="apple-touch-icon"/>