Created
July 30, 2022 11:10
-
-
Save Trusted97/df83cdd169b4aaa3bf79ad60b25d5e62 to your computer and use it in GitHub Desktop.
Revisions
-
Trusted97 created this gist
Jul 30, 2022. There are no files selected for viewing.
"""Build video-sitemap fragments for pages that embed YouTube videos.

Reads page URLs from ``urls.txt`` (whitespace separated, any number per
line), downloads each page, collects the YouTube ``<iframe>`` embeds found on
it, asks YouTube's oEmbed endpoint for each video's title and thumbnail, and
writes one ``<url>...</url>`` fragment per page to ``sitemap_videos.txt``.

The output holds only the ``<url>`` fragments; no XML declaration or
``<urlset>`` root element is written.
"""
import json
import re
import urllib.request
from urllib.request import urlopen
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup

# Prefix identifying a YouTube player iframe; whatever follows it is the code.
EMBED_PREFIX = 'https://www.youtube.com/embed/'

# oEmbed lookup URL; {EMBED_CODE} is substituted with the video code.
OEMBED_URL = ('https://www.youtube.com/oembed'
              '?url=https://www.youtube.com/watch?v={EMBED_CODE}&format=json')

# Seconds to wait on any single HTTP request. Without a timeout both
# urllib and requests can block forever on an unresponsive server.
REQUEST_TIMEOUT = 30

# NOTE(review): Google's video sitemap documentation describes
# <video:player_loc> as the tag for an embeddable player URL and
# <video:content_loc> as the tag for the media file itself. The original tag
# is kept so the output format does not change silently; confirm which one is
# intended.
SITEMAP_SECTION = """
<video:video>
<video:thumbnail_loc>{THUMB_PIC_URL}</video:thumbnail_loc>
<video:title>{TITLE}</video:title>
<video:description>{DESC}</video:description>
<video:content_loc>{EMBED_URL}</video:content_loc>
<video:family_friendly>yes</video:family_friendly>
</video:video>"""

filepath = 'urls.txt'  # File that contains the urls that you want add in Video Sitemap
output_path = 'sitemap_videos.txt'  # File that receives the generated fragments


def get_embed_code(youtube_url):
    """Return the YouTube embed codes found in the page at *youtube_url*.

    Despite its name, the parameter is the URL of any HTML page; the page is
    downloaded and every ``<iframe>`` whose ``src`` contains the YouTube embed
    prefix contributes one code.

    Args:
        youtube_url: URL of the page to scan.

    Returns:
        A list of embed codes (the part of the iframe ``src`` after
        ``https://www.youtube.com/embed/``, without player query string or
        fragment), in document order. Empty when the page has no embeds.

    Raises:
        urllib.error.URLError: if the page cannot be downloaded.
        ValueError: if *youtube_url* is not a valid URL.
    """
    with urlopen(youtube_url, timeout=REQUEST_TIMEOUT) as response:
        soup = BeautifulSoup(response.read(), features="html.parser")

    embed_codes = []
    for iframe in soup.find_all('iframe'):
        src = iframe.get('src')
        # Iframes without a src attribute yield None; skip them instead of
        # failing on the substring test.
        if not src or EMBED_PREFIX not in src:
            continue
        code = src.replace(EMBED_PREFIX, '')
        # Drop player options such as "?rel=0" or "#t=10": only the bare code
        # is valid inside the oEmbed "watch?v=" lookup URL.
        code = re.split(r'[?#]', code, maxsplit=1)[0]
        if code:
            embed_codes.append(code)
    return embed_codes


def get_info_from_embed(embed_code):
    """Fetch title and thumbnail URL of a video from YouTube's oEmbed endpoint.

    Args:
        embed_code: Video code as returned by ``get_embed_code``.

    Returns:
        A dict with keys ``'title'`` and ``'thumb_url'`` when the endpoint
        answers a HEAD request with status 200; an empty dict otherwise.

    Raises:
        requests.exceptions.RequestException: if the HEAD check fails at the
            transport level (including timeout).
        urllib.error.URLError: if the follow-up GET fails.
        KeyError: if the oEmbed payload lacks ``title`` or ``thumbnail_url``.
    """
    url = OEMBED_URL.replace('{EMBED_CODE}', embed_code)
    video_info = {}

    # Check that the URL still exists before downloading the payload.
    head_response = requests.head(url, timeout=REQUEST_TIMEOUT)
    if head_response.status_code == 200:
        request = urllib.request.Request(url)
        with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
            payload = json.loads(response.read().decode('utf-8'))
        video_info['title'] = payload['title']
        video_info['thumb_url'] = payload['thumbnail_url']
    return video_info


def get_sitemap_section(thumbs_pic_url, title, desc, embed_url):
    """Render one ``<video:video>`` sitemap element.

    Every value is XML-escaped, so titles or URLs containing ``&``, ``<`` or
    ``>`` still produce well-formed XML. All four placeholders are filled in a
    single pass, so placeholder-like text inside one value is never replaced
    by another value.

    Args:
        thumbs_pic_url: URL of the video thumbnail.
        title: Video title.
        desc: Video description.
        embed_url: URL of the embedded player.

    Returns:
        The rendered XML fragment as a string, starting with a newline.
    """
    return SITEMAP_SECTION.format(
        THUMB_PIC_URL=escape(thumbs_pic_url),
        TITLE=escape(title),
        DESC=escape(desc),
        EMBED_URL=escape(embed_url),
    )


def main():
    """Generate the sitemap fragments for every URL listed in ``filepath``."""
    # The input is opened first so that a missing urls file does not truncate
    # an existing output file. UTF-8 is explicit because video titles are
    # frequently non-ASCII and the platform default encoding may reject them.
    with open(filepath, encoding='utf-8') as urls_file, \
            open(output_path, 'w', encoding='utf-8') as sitemap_file:
        for line in urls_file:
            # str.split() with no argument splits on any whitespace run and
            # returns [] for blank lines, so those are skipped.
            for current_url in line.split():
                embed_codes = get_embed_code(current_url)
                sitemap_file.write(
                    '<url><loc>{}</loc>'.format(escape(current_url)))
                for embed_code in embed_codes:
                    info = get_info_from_embed(embed_code)
                    # An empty dict means the oEmbed lookup did not return
                    # 200; there is nothing to describe for this video.
                    if not info:
                        continue
                    title = info.get('title')
                    thumb_url = info.get('thumb_url')
                    desc = 'Custom Text:' + title
                    embed_video_url = EMBED_PREFIX + embed_code
                    sitemap_file.write(get_sitemap_section(
                        thumb_url, title, desc, embed_video_url))
                sitemap_file.write('</url>')


if __name__ == '__main__':
    main()