Skip to content

Instantly share code, notes, and snippets.

@Trusted97
Created July 30, 2022 11:10
Show Gist options
  • Save Trusted97/df83cdd169b4aaa3bf79ad60b25d5e62 to your computer and use it in GitHub Desktop.
Save Trusted97/df83cdd169b4aaa3bf79ad60b25d5e62 to your computer and use it in GitHub Desktop.
Video sitemap generator
import json
import re
import urllib.request
from urllib.request import urlopen
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
def get_embed_code(youtube_url):
    """Return the YouTube video IDs embedded as iframes in a web page.

    The page at *youtube_url* is downloaded and parsed, and every
    ``<iframe>`` whose ``src`` contains ``https://www.youtube.com/embed/``
    contributes the video ID that follows that prefix.

    Args:
        youtube_url: URL of the page to scan for embedded YouTube players.

    Returns:
        A list of video ID strings in document order (empty when the page
        embeds no YouTube player).

    Raises:
        ValueError: if *youtube_url* is not a URL ``urlopen`` understands.
        urllib.error.URLError: if the page cannot be fetched.
    """
    embed_prefix = 'https://www.youtube.com/embed/'

    # Close the HTTP response deterministically once the body has been read.
    with urlopen(youtube_url) as response:
        page = BeautifulSoup(response.read(), features="html.parser")

    embed_codes = []
    for frame in page.find_all('iframe'):
        src = frame.get('src')
        # An iframe without a src attribute yields None; testing
        # membership on None would raise TypeError.
        if not src or embed_prefix not in src:
            continue
        video_id = src.split(embed_prefix, 1)[1]
        # Drop player options and trailing path/fragment
        # (e.g. "abc123?rel=0" -> "abc123") so the ID can be used in a
        # watch URL later on.
        for separator in '?&#/':
            video_id = video_id.partition(separator)[0]
        if video_id:
            embed_codes.append(video_id)
    return embed_codes
def get_info_from_embed(embed_code):
    """Look up a video's title and thumbnail through YouTube's oEmbed endpoint.

    Args:
        embed_code: YouTube video ID (the part after ``/embed/``).

    Returns:
        A dict with the keys ``'title'`` and ``'thumb_url'``, or ``None``
        when the endpoint does not answer with HTTP 200 (for example when
        the video is no longer available).

    Raises:
        requests.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
        KeyError: if the JSON lacks ``title`` or ``thumbnail_url``.
    """
    # A single GET both checks availability and fetches the metadata, so no
    # preliminary HEAD request is needed.  Passing the query via ``params``
    # percent-encodes the nested watch URL, keeping ``format=json`` a
    # parameter of the oEmbed call rather than of the watch URL.
    response = requests.get(
        'https://www.youtube.com/oembed',
        params={
            'url': 'https://www.youtube.com/watch?v=' + embed_code,
            'format': 'json',
        },
        timeout=10,  # never hang the whole run on one unresponsive request
    )
    if response.status_code != 200:
        return None

    cont = response.json()
    return {
        'title': cont['title'],
        'thumb_url': cont['thumbnail_url'],
    }
def get_sitemap_section(thumbs_pic_url, title, desc, embed_url):
    """Build the ``<video:video>`` XML fragment for one video.

    All four values are XML-escaped (``&``, ``<``, ``>``) before insertion,
    so titles such as ``"Tom & Jerry"`` produce well-formed XML.

    Args:
        thumbs_pic_url: URL of the video thumbnail image.
        title: Video title.
        desc: Video description.
        embed_url: URL of the embedded player for the video.

    Returns:
        The fragment as a string. It starts with a newline and has no
        trailing newline.

    NOTE(review): the player URL is written to ``<video:content_loc>``;
    Google's video sitemap documentation describes ``<video:player_loc>``
    as the tag for embed-player URLs -- confirm which one is intended.
    """
    template = "\n".join([
        "",
        "<video:video>",
        "<video:thumbnail_loc>{thumb}</video:thumbnail_loc>",
        "<video:title>{title}</video:title>",
        "<video:description>{desc}</video:description>",
        "<video:content_loc>{embed}</video:content_loc>",
        "<video:family_friendly>yes</video:family_friendly>",
        "</video:video>",
    ])
    # str.format substitutes every field in one pass, so placeholder-like
    # text inside a value (e.g. a title containing "{DESC}") is never
    # re-interpreted the way chained str.replace calls would.
    return template.format(
        thumb=escape(thumbs_pic_url),
        title=escape(title),
        desc=escape(desc),
        embed=escape(embed_url),
    )
# Script body: for every page URL listed in `filepath`, write one
# <url>...</url> sitemap entry holding a <video:video> section for each
# YouTube video embedded in that page.
filepath = 'urls.txt'  # File that contains the URLs you want to add to the video sitemap

# Both files are managed by `with`, so the output is flushed and closed even
# if a download fails midway.  The output is written as UTF-8 so non-ASCII
# video titles do not depend on the platform's default encoding.
with open("sitemap_videos.txt", "w", encoding="utf-8") as song_file, \
        open(filepath) as fp:
    for line in fp:
        # A line may hold several whitespace-separated URLs.  str.split()
        # with no argument yields nothing for blank lines, so an empty
        # string is never passed on as a URL to download.
        for current_url in line.split():
            embed_codes = get_embed_code(current_url)
            # '&' and '<' in the page URL must be entity-escaped in XML.
            song_file.write(
                '<url><loc>{URL}</loc>'.replace('{URL}', escape(current_url))
            )
            for embed_code in embed_codes:
                info = get_info_from_embed(embed_code)
                # No (or empty) metadata means the video could not be
                # looked up; leave it out of the sitemap.
                if not info:
                    continue
                title = info.get('title')
                thumb_url = info.get('thumb_url')
                desc = 'Custom Text:' + title
                embed_video_url = 'https://www.youtube.com/embed/' + embed_code
                song_file.write(
                    get_sitemap_section(thumb_url, title, desc, embed_video_url)
                )
            song_file.write('</url>')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment