Last active
September 27, 2021 11:07
-
Star
(161)
You must be signed in to star a gist -
Fork
(25)
You must be signed in to fork a gist
-
-
Save grantslatton/7694811 to your computer and use it in GitHub Desktop.
Revisions
-
grantslatton revised this gist
Nov 28, 2013 . 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,7 +6,7 @@ """ PLEASE DO NOT RUN THIS QUOTED CODE FOR THE SAKE OF daemonology's SERVER, IT IS NOT MY SERVER AND I FEEL BAD FOR ABUSING IT. JUST GET THE RESULTS OF THE CRAWL HERE: http://pastebin.com/raw.php?i=nqpsnTtW AND SAVE THEM TO "archive.txt" archive = open("archive.txt","w") -
grantslatton revised this gist
Nov 28, 2013 . 1 changed file with 2 additions and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -71,7 +71,8 @@ def sample(items): if sentence in title: flag = False break if flag: sentences.append(sentence) for sentence in sentences: print sentence -
grantslatton revised this gist
Nov 28, 2013 . 1 changed file with 2 additions and 2 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -5,8 +5,8 @@ from random import random """ PLEASE DO NOT RUN THIS QUOTED CODE FOR THE SAKE OF daemonology's SERVER, IT IS NOT MY SERVER AND I FEEL BAD FOR ABUSING IT. JUST GET THE RESULTS HERE OF THE CRAWL HERE: http://pastebin.com/raw.php?i=nqpsnTtW AND SAVE THEM TO "archive.txt" archive = open("archive.txt","w") -
grantslatton created this gist
Nov 28, 2013 . There are no files selected for viewing
try:
    import urllib2  # Python 2; only referenced by the quoted crawler below
except ImportError:
    # Python 3 has no urllib2; urllib.request provides the same urlopen().
    import urllib.request as urllib2
import re
import sys
from collections import defaultdict
from random import random

"""
PLEASE DO NOT RUN THIS QUOTED CODE FOR THE SAKE OF daemonology's SERVER,
IT IS NOT MY SERVER AND I FEEL BAD FOR ABUSING IT.
JUST GET THE RESULTS OF THE CRAWL HERE: http://pastebin.com/raw.php?i=nqpsnTtW
AND SAVE THEM TO "archive.txt"

archive = open("archive.txt","w")

for year in xrange(1,4):
    for month in xrange(1,13):
        for day in xrange(1,32):
            try:
                print "http://www.daemonology.net/hn-daily/201%d-%02d-%02d.html" % (year, month, day)
                response = urllib2.urlopen("http://www.daemonology.net/hn-daily/201%d-%02d-%02d.html" % (year, month, day))
                html = response.read()
                titles = re.findall(r'ylink"><[^>]*>([^<]*)', html)
                for title in titles:
                    archive.write(title+"\n")
            except:
                #Invalid dates, could make this less hacky... but... meh
                pass

archive.close()
"""

# Number of preceding words used as the context when picking the next word.
lookback = 2


def load_titles(path="archive.txt"):
    """Return the non-blank lines of the archive file at *path*.

    Blank lines are dropped instead of blindly discarding the last element of
    the split, so the final title survives even when the file does not end
    with a newline.
    """
    with open(path) as archive:
        return [line for line in archive.read().split("\n") if line.strip()]


def build_markov_map(titles, lookback):
    """Build the map: context -> next word -> probability.

    The context is the (up to) *lookback* preceding words joined by spaces;
    the empty context '' marks the start of a title and the empty next word
    '' marks its end. Titles with *lookback* words or fewer are skipped.
    """
    markov_map = defaultdict(lambda: defaultdict(int))

    # Count occurrences of each word after each context.
    for title in titles:
        words = title.split()
        if len(words) <= lookback:
            continue
        # Running one step past the last word records the end-of-title marker.
        for i in range(len(words) + 1):
            context = ' '.join(words[max(0, i - lookback):i])
            following = ' '.join(words[i:i + 1])
            markov_map[context][following] += 1

    # Normalise the counts into probabilities.
    for following in markov_map.values():
        total = float(sum(following.values()))
        for key in following:
            following[key] /= total

    return markov_map


def sample(items):
    """Pick one key from *items*, an iterable of (key, weight) pairs.

    Single pass over the pairs: each key replaces the current pick with
    probability weight / (sum of weights seen so far), which selects every
    key in proportion to its weight. Returns None when *items* is empty or
    all weights are zero.
    """
    next_word = None
    t = 0.0
    for k, v in items:
        t += v
        if t and random() < v / t:
            next_word = k
    return next_word


def generate_sentence(markov_map, lookback):
    """Walk the chain from the start context and return the sentence.

    The walk stops at the end-of-title marker ''. It also stops when the
    current context has nothing to sample from (sample() returned None),
    so an empty model yields '' instead of crashing.
    """
    words = []
    # .get() keeps the lookup from inserting new keys into a defaultdict.
    next_word = sample(markov_map.get('', {}).items())
    while next_word:
        words.append(next_word)
        context = ' '.join(words[-lookback:])
        next_word = sample(markov_map.get(context, {}).items())
    return ' '.join(words)


def generate_sentences(markov_map, titles, lookback, count=100,
                       max_attempts=None):
    """Generate up to *count* sentences that are not part of any real title.

    A candidate that is a substring of an actual title is discarded.
    *max_attempts* (default ``count * 100``) bounds the number of candidates
    tried, so a corpus too small to produce novel sentences cannot hang the
    script; fewer than *count* sentences may be returned in that case.
    """
    if max_attempts is None:
        max_attempts = count * 100

    # Generated sentences never contain a newline, so one substring test on
    # the newline-joined corpus is equivalent to testing every title in turn.
    corpus = "\n".join(titles)

    sentences = []
    for _ in range(max_attempts):
        if len(sentences) >= count:
            break
        sentence = generate_sentence(markov_map, lookback)
        if not sentence:
            # Empty model: nothing can ever be generated.
            break
        if sentence not in corpus:
            sentences.append(sentence)
    return sentences


def main():
    """Read archive.txt, train the model and print the generated titles."""
    titles = load_titles("archive.txt")
    markov_map = build_markov_map(titles, lookback)
    for sentence in generate_sentences(markov_map, titles, lookback):
        print(sentence)


if __name__ == "__main__":
    main()