Created
July 30, 2022 11:10
-
-
Save Trusted97/df83cdd169b4aaa3bf79ad60b25d5e62 to your computer and use it in GitHub Desktop.
Video sitemap generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| from bs4 import BeautifulSoup | |
| import urllib.request | |
| from urllib.request import urlopen | |
| import json | |
| import requests | |
def get_embed_code(youtube_url):
    """Collect the YouTube embed codes of all YouTube iframes found on a page.

    Args:
        youtube_url: URL of the page to download and scan. Despite the
            name, this is the page that *contains* the embedded players.

    Returns:
        A list with one entry per matching ``<iframe>``: its ``src`` with
        the ``https://www.youtube.com/embed/`` prefix removed. Whatever
        follows the prefix in ``src`` (including any query string) is kept
        as-is. The list is empty when the page has no YouTube iframe.
    """
    embed_prefix = 'https://www.youtube.com/embed/'

    # Read the whole page inside a context manager so the HTTP response is
    # closed as soon as the markup has been parsed.
    with urlopen(youtube_url) as response:
        page = BeautifulSoup(response.read(), features="html.parser")

    embed_codes = []  # Embed codes found so far, in document order
    for iframe in page.find_all('iframe'):
        src = iframe.get('src')
        # An iframe may have no src attribute at all (e.g. lazy-loaded
        # players); .get() then returns None, which must be skipped rather
        # than tested with `in`.
        if src and embed_prefix in src:
            # Keep only src values that belong to a YouTube iframe
            embed_codes.append(src.replace(embed_prefix, ''))
    return embed_codes
def get_info_from_embed(embed_code):
    """Look up a video's title and thumbnail through YouTube's oEmbed endpoint.

    Args:
        embed_code: The video identifier, inserted verbatim as the ``v``
            parameter of the watch URL handed to the oEmbed endpoint.

    Returns:
        A dict with the keys ``'title'`` and ``'thumb_url'`` when the
        endpoint answers the HEAD probe with status 200; otherwise an empty
        dict. Callers should therefore test the result for emptiness, not
        for ``None``.
    """
    raw_url = 'https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={EMBED_CODE}&format=json'
    url = raw_url.replace('{EMBED_CODE}', embed_code)
    video_info = {}

    # Cheap existence probe first: removed or private videos do not answer
    # with 200, and for those an empty dict is returned.
    probe = requests.head(url)
    if probe.status_code != 200:
        return video_info

    # Fetch and parse the JSON payload; the context manager closes the
    # HTTP response once the body has been read.
    with urllib.request.urlopen(urllib.request.Request(url)) as response:
        cont = json.loads(response.read().decode('utf-8'))

    video_info['title'] = cont['title']
    video_info['thumb_url'] = cont['thumbnail_url']
    return video_info
def get_sitemap_section(thumbs_pic_url, title, desc, embed_url):
    """Build the ``<video:video>`` XML fragment describing one video.

    All four values are XML-escaped (``&``, ``<``, ``>``) before being
    inserted, so pass them as plain, unescaped text.

    Args:
        thumbs_pic_url: URL written into ``<video:thumbnail_loc>``.
        title: Text written into ``<video:title>``.
        desc: Text written into ``<video:description>``.
        embed_url: URL written into ``<video:content_loc>``.

    Returns:
        The fragment as a string, starting with a newline and ending with
        ``</video:video>``.
    """
    # Function-scope import: the file's import block does not provide it.
    from xml.sax.saxutils import escape

    # NOTE(review): the parameter name suggests an embed/player URL is
    # passed here; the video sitemap schema also has <video:player_loc>
    # for player URLs -- confirm that <video:content_loc> is intended.
    sitemap_section = """
<video:video>
<video:thumbnail_loc>{THUMB_PIC_URL}</video:thumbnail_loc>
<video:title>{TITLE}</video:title>
<video:description>{DESC}</video:description>
<video:content_loc>{EMBED_URL}</video:content_loc>
<video:family_friendly>yes</video:family_friendly>
</video:video>"""

    # A single format() pass fills every placeholder at once, so
    # placeholder-looking text inside one value is never substituted again
    # by a later replacement.
    return sitemap_section.format(
        THUMB_PIC_URL=escape(thumbs_pic_url),
        TITLE=escape(title),
        DESC=escape(desc),
        EMBED_URL=escape(embed_url),
    )
# Script section: read page URLs from `filepath`, look up the YouTube videos
# embedded in each page, and write one <url> sitemap entry per page to
# sitemap_videos.txt.
from xml.sax.saxutils import escape  # used only for the <loc> value below

filepath = 'urls.txt'  # File that contains the urls that you want add in Video Sitemap

# Both files are managed by `with`, so the output is flushed and closed even
# if a download fails part-way through. The sitemap is written as UTF-8 so
# non-ASCII video titles do not depend on the platform's default encoding.
with open("sitemap_videos.txt", "w", encoding="utf-8") as song_file, open(filepath) as fp:
    for line in fp:
        # A line may hold several whitespace-separated URLs. split() with
        # no argument yields nothing for blank lines, so they are skipped
        # instead of being passed on as an empty URL.
        for currentUrl in line.split():
            embed_codes = get_embed_code(currentUrl)

            # Escape the page URL: a raw '&' in <loc> would make the XML invalid.
            header = '<url><loc>{URL}</loc>'.replace('{URL}', escape(currentUrl))
            song_file.write(header)

            for embed_code in embed_codes:
                embed_video_url = 'https://www.youtube.com/embed/' + embed_code
                info = get_info_from_embed(embed_code)
                # The lookup yields an empty dict when the video is not
                # available; skip such videos rather than failing on the
                # missing title.
                if not info:
                    continue
                title = info.get('title')
                thumb_url = info.get('thumb_url')
                if title is None or thumb_url is None:
                    continue
                desc = 'Custom Text:' + title
                video_section = get_sitemap_section(thumb_url, title, desc, embed_video_url)
                song_file.write(video_section)

            song_file.write('</url>')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment