ruanbekker · January 22, 2024 05:49 · Sep 20, 2017 · Apr 16, 2017 · Apr 16, 2017
diff --git a/python-sitemap-to-elasticsearch.py b/python-sitemap-to-elasticsearch.py
@@ -1,3 +1,5 @@
+# centos: libxslt-devel python-devel
+# debian: libxslt1-dev python-dev  (presumed equivalents of the centos packages - TODO confirm)
 import re
 import time
 import requests

diff --git a/python-sitemap-to-elasticsearch.py b/python-sitemap-to-elasticsearch.py
@@ -1,7 +1,6 @@
 import re
 import time
 import requests
-import urllib2
 from bs4 import BeautifulSoup
 from elasticsearch import Elasticsearch
 
@@ -14,8 +13,8 @@ def urlparser(title, url):
     # scrape title
     p = {}
     post = title
-    page = urllib2.urlopen(post)
-    soup = BeautifulSoup(page.read(), 'lxml')
+    page = requests.get(post).content
+    soup = BeautifulSoup(page, 'lxml')
     title_name = soup.title.string
 
     # scrape tags
@@ -34,12 +33,12 @@ def urlparser(title, url):
 
     # ingest payload into elasticsearch
     res = es_client.index(index="myindex-test", doc_type="docs", body=doc)
-    time.sleep(1.2)
+    time.sleep(0.5)
 
 sitemap_feed = 'https://sysadmins.co.za/sitemap-posts.xml'
 page = requests.get(sitemap_feed)
 sitemap_index = BeautifulSoup(page.content, 'html.parser')
 urls = [element.text for element in sitemap_index.findAll('loc')]
 
 for x in urls:
-    urlparser(x, x)
+    urlparser(x, x)
diff --git a/python-sitemap-to-elasticsearch.py b/python-sitemap-to-elasticsearch.py
@@ -0,0 +1,45 @@
+import re
+import time
+import requests
+import urllib2
+from bs4 import BeautifulSoup
+from elasticsearch import Elasticsearch
+
+es_client = Elasticsearch(['http://10.0.1.11:9200'])
+
+drop_index = es_client.indices.delete(index='myindex-test', ignore=[400, 404])  # drop any previous copy first
+create_index = es_client.indices.create(index='myindex-test', ignore=400)  # then recreate it empty
+
+def urlparser(title, url):
+    # scrape title
+    p = {}
+    post = title
+    page = urllib2.urlopen(post)
+    soup = BeautifulSoup(page.read(), 'lxml')
+    title_name = soup.title.string
+
+    # scrape tags
+    # collect the content of every element carrying property="article:tag",
+    # iterating directly so the tags stay in document order
+    desc = soup.findAll(attrs={"property":"article:tag"})
+    tag_names = [tag['content'].encode('utf-8') for tag in desc]
+
+    # payload for elasticsearch
+    doc = {
+        'date': time.strftime("%Y-%m-%d"),
+        'title': title_name,
+        'tags': tag_names,
+        'url': url
+    }
+
+    # ingest payload into elasticsearch
+    res = es_client.index(index="myindex-test", doc_type="docs", body=doc)
+    time.sleep(1.2)
+
+sitemap_feed = 'https://sysadmins.co.za/sitemap-posts.xml'
+page = requests.get(sitemap_feed)
+sitemap_index = BeautifulSoup(page.content, 'html.parser')
+urls = [element.text for element in sitemap_index.findAll('loc')]
+
+for x in urls:
+    urlparser(x, x)