Last active
January 22, 2024 05:49
-
-
Save ruanbekker/bcf5f00a0a8ed7b9c6a7fbac7e92f7e6 to your computer and use it in GitHub Desktop.
Revisions
-
ruanbekker revised this gist
Sep 20, 2017. 1 changed file with 2 additions and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,5 @@ # centos: libxslt-devel python-devel # debian: import re import time import requests -
ruanbekker revised this gist
Apr 16, 2017. 1 changed file with 4 additions and 5 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,7 +1,6 @@ import re import time import requests from bs4 import BeautifulSoup from elasticsearch import Elasticsearch @@ -14,8 +13,8 @@ def urlparser(title, url): # scrape title p = {} post = title page = requests.get(post).content soup = BeautifulSoup(page, 'lxml') title_name = soup.title.string # scrape tags @@ -34,12 +33,12 @@ def urlparser(title, url): # ingest payload into elasticsearch res = es_client.index(index="myindex-test", doc_type="docs", body=doc) time.sleep(0.5) sitemap_feed = 'https://sysadmins.co.za/sitemap-posts.xml' page = requests.get(sitemap_feed) sitemap_index = BeautifulSoup(page.content, 'html.parser') urls = [element.text for element in sitemap_index.findAll('loc')] for x in urls: urlparser(x, x) -
ruanbekker created this gist
Apr 16, 2017. There are no files selected for viewing.
"""Scrape the posts listed in a sitemap and index their metadata in Elasticsearch.

The script resets the target index, reads every ``<loc>`` entry from the
sitemap, and for each listed page stores one document holding the page
title, its ``article:tag`` values, the URL and the current date.
"""
import re
import time

import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

INDEX_NAME = 'myindex-test'
# Pause after each indexed page so the crawl does not hammer the site.
CRAWL_DELAY_SECONDS = 1.2

es_client = Elasticsearch(['http://10.0.1.11:9200'])

# Start from a clean index: drop any previous copy first, then create it.
# (Creating before deleting would leave no index behind at all.)
drop_index = es_client.indices.delete(index=INDEX_NAME, ignore=[400, 404])
create_index = es_client.indices.create(index=INDEX_NAME, ignore=400)


def urlparser(title, url):
    """Fetch one page, scrape its title and tags, and index them.

    Args:
        title: Despite the name, this is the address of the page to fetch;
            the stored title is scraped from the page's ``<title>`` element.
        url: Value stored in the ``url`` field of the indexed document.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    response = requests.get(title)
    # Fail on HTTP error statuses so an error page is never indexed as a post.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # A page without a <title> element is indexed with a null title
    # instead of aborting the whole crawl.
    title_name = soup.title.string if soup.title is not None else None

    # Collect the tags in document order. They are kept as text (not
    # encoded to bytes) so the payload stays JSON-serializable.
    tag_names = [
        meta['content']
        for meta in soup.findAll(attrs={"property": "article:tag"})
    ]

    # payload for elasticsearch
    doc = {
        'date': time.strftime("%Y-%m-%d"),
        'title': title_name,
        'tags': tag_names,
        'url': url
    }

    # ingest payload into elasticsearch
    es_client.index(index=INDEX_NAME, doc_type="docs", body=doc)
    time.sleep(CRAWL_DELAY_SECONDS)


sitemap_feed = 'https://sysadmins.co.za/sitemap-posts.xml'
page = requests.get(sitemap_feed)
# An error response would otherwise parse to zero URLs and silently do nothing.
page.raise_for_status()
sitemap_index = BeautifulSoup(page.content, 'html.parser')
urls = [element.text for element in sitemap_index.findAll('loc')]

for x in urls:
    # The page address doubles as the value stored in the "url" field.
    urlparser(x, x)