Skip to content

Instantly share code, notes, and snippets.

View pguther's full-sized avatar

Philip Guther pguther

  • San Francisco, CA
View GitHub Profile
@pguther
pguther / remove_url_duplicates
Last active September 29, 2016 17:46
Simple Python script to remove duplicate site names from a CSV file containing a subdomain and URL path on each row
import argparse
import csv
from collections import OrderedDict
import re
# Command-line interface: the script takes exactly one positional argument,
# the path of the CSV file to deduplicate.
# NOTE(review): "URls" in the help text is a typo for "URLs" — user-visible
# string, left unchanged here; confirm before fixing.
parser = argparse.ArgumentParser(description='Remove duplicate URls from csv of pages mentioning C8.')
parser.add_argument('file', help='The CSV file to read')
# Parses sys.argv at import time; args.file holds the CSV path.
args = parser.parse_args()
## Velocity (Cascade CMS style) fragment: extract display fields from the
## page that is calling this index block. Assumes $contentRoot exposes a
## /system-index-block/calling-page/system-page node -- TODO confirm schema.
## Get XML for current page
#set($article = $_XPathTool.selectSingleNode($contentRoot, "/system-index-block/calling-page/system-page"))
## Get the title of the page
#set($title = $article.getChild("title"))
## Page path (plain text) used as the link target
#set($link = $article.getChild("path").text)
## Teaser/summary element for the page
#set($summary = $article.getChild("summary"))
## Lead image path from the page's structured data; presumably fails if the
## data structure lacks a lead-image/image/path chain -- verify against caller
#set($image = $article.getChild("system-data-structure").getChild("lead-image").getChild("image").getChild("path").text)
import bs4
from bs4 import BeautifulSoup
from unidecode import unidecode
import urllib
import re
import datetime
import requests
import argparse
# Matches long-form dates such as "September 29, 2016": a month word,
# optional whitespace, a 1-2 digit day, a comma, optional whitespace, and a
# 4-digit year. Fix: dropped the redundant backslash before the comma
# (`\,` and `,` match identically; the escape is a lint-flagged no-op).
# NOTE(review): \s* permits zero spaces, so "September29, 2016" also
# matches -- confirm whether that is intentional.
date_regex = re.compile(r"[A-Za-z]+\s*\d{1,2},\s*\d{4}")
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<!-- BLOCK: uniform campus head -->
<meta charset="UTF-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, user-scalable=yes, initial-scale=1.0, minimum-scale=1.0, maximum-scale=2.0" name="viewport"/>
<link href="../favicon.ico" rel="shortcut icon"/>
<link href="../apple-touch-icon.png" rel="apple-touch-icon"/>