Skip to content

Instantly share code, notes, and snippets.

@ruanbekker
Last active January 22, 2024 05:49
Show Gist options
  • Save ruanbekker/bcf5f00a0a8ed7b9c6a7fbac7e92f7e6 to your computer and use it in GitHub Desktop.
Save ruanbekker/bcf5f00a0a8ed7b9c6a7fbac7e92f7e6 to your computer and use it in GitHub Desktop.

Revisions

  1. ruanbekker revised this gist Sep 20, 2017. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions python-sitemap-to-elasticsearch.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,5 @@
    # centos: libxslt-devel python-devel
    # debian: libxslt1-dev python-dev
    import re
    import time
    import requests
  2. ruanbekker revised this gist Apr 16, 2017. 1 changed file with 4 additions and 5 deletions.
    9 changes: 4 additions & 5 deletions python-sitemap-to-elasticsearch.py
    Original file line number Diff line number Diff line change
    @@ -1,7 +1,6 @@
    import re
    import time
    import requests
    import urllib2
    from bs4 import BeautifulSoup
    from elasticsearch import Elasticsearch

    @@ -14,8 +13,8 @@ def urlparser(title, url):
    # scrape title
    p = {}
    post = title
    page = urllib2.urlopen(post)
    soup = BeautifulSoup(page.read(), 'lxml')
    page = requests.get(post).content
    soup = BeautifulSoup(page, 'lxml')
    title_name = soup.title.string

    # scrape tags
    @@ -34,12 +33,12 @@ def urlparser(title, url):

    # ingest payload into elasticsearch
    res = es_client.index(index="myindex-test", doc_type="docs", body=doc)
    time.sleep(1.2)
    time.sleep(0.5)

    sitemap_feed = 'https://sysadmins.co.za/sitemap-posts.xml'
    page = requests.get(sitemap_feed)
    sitemap_index = BeautifulSoup(page.content, 'html.parser')
    urls = [element.text for element in sitemap_index.findAll('loc')]

    for x in urls:
    urlparser(x, x)
    urlparser(x, x)
  3. ruanbekker created this gist Apr 16, 2017.
    45 changes: 45 additions & 0 deletions python-sitemap-to-elasticsearch.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,45 @@
    import re
    import time
    import requests
    import urllib2
    from bs4 import BeautifulSoup
    from elasticsearch import Elasticsearch

    # Client for the Elasticsearch node that receives the scraped documents.
    es_client = Elasticsearch(['http://10.0.1.11:9200'])

    # Start every run from a clean index: drop any existing copy first, then
    # recreate it. The `ignore` status codes keep a missing index (delete) or
    # an already-existing index (create) from raising.
    drop_index = es_client.indices.delete(index='myindex-test', ignore=[400, 404])
    create_index = es_client.indices.create(index='myindex-test', ignore=400)

    def urlparser(title, url):
        """Scrape one page and index its title and tags into Elasticsearch.

        Args:
            title: Address of the page to download. Despite the name, this
                value is fetched as a URL; the stored title is read from the
                downloaded page itself.
            url: Value stored in the ``url`` field of the indexed document.

        Returns:
            None. One document is written to the ``myindex-test`` index.
        """
        # download the page, making sure the HTTP response is closed afterwards
        page = urllib2.urlopen(title)
        try:
            soup = BeautifulSoup(page.read(), 'lxml')
        finally:
            page.close()

        # scrape title; a page without a <title> element yields None
        title_name = soup.title.string if soup.title is not None else None

        # scrape tags, in the order they appear in the page
        tag_names = [
            tag['content'].encode('utf-8')
            for tag in soup.findAll(attrs={"property": "article:tag"})
        ]

        # payload for elasticsearch
        doc = {
            'date': time.strftime("%Y-%m-%d"),
            'title': title_name,
            'tags': tag_names,
            'url': url,
        }

        # ingest payload into elasticsearch
        es_client.index(index="myindex-test", doc_type="docs", body=doc)

        # pause before returning -- presumably to throttle requests to the
        # scraped site; confirm before changing the delay
        time.sleep(1.2)

    # Sitemap whose <loc> entries are the pages to scrape.
    sitemap_feed = 'https://sysadmins.co.za/sitemap-posts.xml'

    # Download the sitemap and parse the response body.
    page = requests.get(sitemap_feed)
    sitemap_index = BeautifulSoup(page.content, 'html.parser')

    # Collect the text of every <loc> element.
    urls = []
    for element in sitemap_index.findAll('loc'):
        urls.append(element.text)

    # Each address is passed as both arguments of urlparser.
    for x in urls:
        urlparser(x, x)