Trusted97 · July 30, 2022 11:10 · Jul 30, 2022
diff --git a/video_sitemap_generator.py b/video_sitemap_generator.py
@@ -0,0 +1,93 @@
+import re
+from bs4 import BeautifulSoup
+import urllib.request
+from urllib.request import urlopen
+import json
+import requests
+
+
+
+def get_embed_code(youtube_url):
+    """Return the YouTube embed codes of every player iframe on a page.
+
+    The page at ``youtube_url`` is downloaded and parsed with BeautifulSoup.
+    For each ``<iframe>`` whose ``src`` contains
+    ``https://www.youtube.com/embed/``, the text following that prefix is
+    collected, with any query string or fragment removed.
+
+    Args:
+        youtube_url: URL of the page to scan. Despite the parameter name it
+            is the page that embeds the videos, not a YouTube URL.
+
+    Returns:
+        A list of embed codes in document order; empty when the page has no
+        matching iframe.
+
+    Raises:
+        Whatever ``urlopen`` raises when the page cannot be fetched.
+    """
+    embed_prefix = 'https://www.youtube.com/embed/'
+
+    # Read the body inside a context manager so the connection is released
+    # even if parsing fails.
+    with urlopen(youtube_url) as response:
+        page = BeautifulSoup(response.read(), features="html.parser")
+
+    embed_codes = []
+    for iframe in page.find_all('iframe'):
+        src = iframe.get('src')
+        # An iframe without a src attribute yields None; skip it instead of
+        # failing on the substring test.
+        if not src or embed_prefix not in src:
+            continue
+        code = src.partition(embed_prefix)[2]
+        # Player options ("?rel=0", "#t=10") are not part of the video id and
+        # would corrupt the lookup URL built from this code later on.
+        code = code.split('?', 1)[0].split('#', 1)[0]
+        if code:
+            embed_codes.append(code)
+
+    return embed_codes
+
+
+def get_info_from_embed(embed_code):
+    """Look up a video's title and thumbnail through YouTube's oEmbed endpoint.
+
+    Args:
+        embed_code: Video code as returned by ``get_embed_code``.
+
+    Returns:
+        A dict with the keys ``'title'`` and ``'thumb_url'``, or ``None`` when
+        the endpoint does not answer with HTTP 200 (for example when the
+        video no longer exists).
+
+    Raises:
+        requests.RequestException: on connection failure or timeout.
+        KeyError: if the response lacks ``title`` or ``thumbnail_url``.
+    """
+    watch_url = 'https://www.youtube.com/watch?v=' + embed_code
+
+    # One GET replaces the former HEAD-then-GET pair. Passing the values via
+    # ``params`` percent-encodes the nested watch URL, and the timeout keeps
+    # a stalled connection from hanging the whole run.
+    response = requests.get(
+        'https://www.youtube.com/oembed',
+        params={'url': watch_url, 'format': 'json'},
+        timeout=10,
+    )
+    if response.status_code != 200:
+        return None
+
+    cont = response.json()
+    return {
+        'title': cont['title'],
+        'thumb_url': cont['thumbnail_url'],
+    }
+
+def get_sitemap_section(thumbs_pic_url,title,desc,embed_url):
+    """Build the ``<video:video>`` XML fragment for one video.
+
+    Every value is XML-escaped before insertion, so characters such as ``&``
+    or ``<`` in a title or URL cannot produce a malformed sitemap. All
+    placeholders are filled in a single pass, so placeholder-like text inside
+    one value is never replaced by another value.
+
+    Args:
+        thumbs_pic_url: URL of the thumbnail image.
+        title: Video title.
+        desc: Video description.
+        embed_url: URL written into ``<video:content_loc>``.
+
+    Returns:
+        The fragment as a string, starting with a newline.
+    """
+    from xml.sax.saxutils import escape
+
+    # NOTE(review): callers pass a player URL as embed_url; confirm whether
+    # <video:player_loc> would be the more appropriate tag for it.
+    sitemap_section = """
+    <video:video>
+        <video:thumbnail_loc>{THUMB_PIC_URL}</video:thumbnail_loc>
+        <video:title>{TITLE}</video:title>
+        <video:description>{DESC}</video:description>
+        <video:content_loc>{EMBED_URL}</video:content_loc>
+        <video:family_friendly>yes</video:family_friendly>
+    </video:video>"""
+
+    return sitemap_section.format(
+        THUMB_PIC_URL=escape(thumbs_pic_url),
+        TITLE=escape(title),
+        DESC=escape(desc),
+        EMBED_URL=escape(embed_url),
+    )
+
+
+
+
+from xml.sax.saxutils import escape
+
+# Input file: the page URLs to include in the video sitemap, separated by
+# whitespace (any number per line).
+filepath = 'urls.txt'
+
+# For every page URL, write one <url> element holding a <video:video> section
+# for each YouTube iframe on that page whose metadata could be retrieved.
+# The context managers close both files even if a request fails midway, and
+# the output is written as UTF-8 so non-ASCII titles do not depend on the
+# platform's default encoding.
+with open(filepath) as fp:
+    with open("sitemap_videos.txt", "w", encoding="utf-8") as song_file:
+        for line in fp:
+            # split() without arguments yields nothing for a blank line, so
+            # an empty string is never handed to get_embed_code().
+            for currentUrl in line.split():
+                song_file.write('<url><loc>' + escape(currentUrl) + '</loc>')
+
+                for embed_code in get_embed_code(currentUrl):
+                    info = get_info_from_embed(embed_code)
+                    # None means the lookup did not succeed; skip the video.
+                    if info is None:
+                        continue
+
+                    title = info.get('title')
+                    thumb_url = info.get('thumb_url')
+                    desc = 'Custom Text:' + title
+                    embed_video_url = 'https://www.youtube.com/embed/' + embed_code
+                    video_section = get_sitemap_section(thumb_url, title, desc, embed_video_url)
+                    song_file.write(video_section)
+
+                # Close the element per URL so each <url> has its own
+                # </url>, even with several URLs on one line.
+                song_file.write('</url>')