Skip to content

Instantly share code, notes, and snippets.

@Trusted97
Created July 30, 2022 11:10
Show Gist options
  • Save Trusted97/df83cdd169b4aaa3bf79ad60b25d5e62 to your computer and use it in GitHub Desktop.
Save Trusted97/df83cdd169b4aaa3bf79ad60b25d5e62 to your computer and use it in GitHub Desktop.

Revisions

  1. Trusted97 created this gist Jul 30, 2022.
    93 changes: 93 additions & 0 deletions video_sitemap_generator.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,93 @@
    import re
    from bs4 import BeautifulSoup
    import urllib.request
    from urllib.request import urlopen
    import json
    import requests



    def get_embed_code(youtube_url):
        """Return the YouTube video IDs embedded as iframes in a web page.

        The page at ``youtube_url`` is downloaded and parsed, and every
        ``<iframe>`` whose ``src`` contains ``https://www.youtube.com/embed/``
        contributes one entry to the result.

        Args:
            youtube_url: URL of the page to scan (despite the name, this is
                the page hosting the embeds, not a YouTube URL).

        Returns:
            A list of video ID strings in document order; empty when the page
            has no matching iframe.

        Raises:
            urllib.error.URLError: if the page cannot be fetched.
        """
        embed_prefix = 'https://www.youtube.com/embed/'

        # Close the HTTP response as soon as the markup has been read.
        with urlopen(youtube_url) as html:
            bsObj = BeautifulSoup(html.read(), features="html.parser")

        embed_codes = []
        for link in bsObj.find_all('iframe'):
            src = link.get('src')
            # An iframe without a src attribute yields None here; testing
            # membership on None would raise TypeError.
            if not src or embed_prefix not in src:
                continue
            code = src.replace(embed_prefix, '')
            # Player options such as "?rel=0" or "#t=30" are not part of the
            # video ID and would corrupt any URL later built from it.
            code = code.partition('?')[0].partition('#')[0]
            if code:
                embed_codes.append(code)

        return embed_codes


    def get_info_from_embed(embed_code):
        """Look up a video's title and thumbnail via the YouTube oEmbed endpoint.

        Args:
            embed_code: the YouTube video ID.

        Returns:
            A dict with the keys ``'title'`` and ``'thumb_url'`` when the
            endpoint answers with HTTP 200, otherwise an empty dict.

        Raises:
            requests.RequestException: on connection failure or timeout.
            KeyError: if a 200 response lacks ``title`` or ``thumbnail_url``.
        """
        # One GET replaces the former HEAD-then-GET pair: same status check,
        # half the round trips, and a single HTTP client instead of two.
        # Passing the query via ``params`` URL-encodes the nested watch URL.
        response = requests.get(
            'https://www.youtube.com/oembed',
            params={
                'url': 'https://www.youtube.com/watch?v=' + embed_code,
                'format': 'json',
            },
            timeout=10,  # do not hang forever on an unresponsive endpoint
        )
        if response.status_code != 200:
            # Callers treat an empty dict as "no info available".
            return {}

        cont = response.json()
        return {
            'title': cont['title'],
            'thumb_url': cont['thumbnail_url'],
        }

    def get_sitemap_section(thumbs_pic_url,title,desc,embed_url):
        """Render one ``<video:video>`` sitemap element as a string.

        Args:
            thumbs_pic_url: URL of the video thumbnail image.
            title: video title.
            desc: video description.
            embed_url: URL written into ``<video:content_loc>``.

        Returns:
            The XML fragment, starting with a newline, with every value
            XML-escaped.
        """
        from xml.sax.saxutils import escape

        # NOTE(review): callers pass a YouTube embed (player) URL here; that
        # may belong in <video:player_loc> rather than <video:content_loc> --
        # confirm against the video sitemap specification before changing it.
        sitemap_section = """
    <video:video>
    <video:thumbnail_loc>{THUMB_PIC_URL}</video:thumbnail_loc>
    <video:title>{TITLE}</video:title>
    <video:description>{DESC}</video:description>
    <video:content_loc>{EMBED_URL}</video:content_loc>
    <video:family_friendly>yes</video:family_friendly>
    </video:video>"""

        # Titles and URLs routinely contain "&" or "<"; unescaped they make
        # the sitemap malformed XML. A single format() call also fills all
        # placeholders in one pass, so a value that happens to contain
        # "{DESC}" is not substituted a second time.
        return sitemap_section.format(
            THUMB_PIC_URL=escape(thumbs_pic_url),
            TITLE=escape(title),
            DESC=escape(desc),
            EMBED_URL=escape(embed_url),
        )




    # Script body: read page URLs from ``urls.txt`` (whitespace-separated, any
    # number per line) and write one <url> element per page, holding a
    # <video:video> section for each YouTube embed found on it, to
    # ``sitemap_videos.txt``.
    from xml.sax.saxutils import escape

    filepath = 'urls.txt' #File that contains the urls that you want add in Video Sitemap

    # Both files are opened with ``with`` so the output is flushed and closed
    # even when a download fails part-way through the list.
    with open(filepath) as fp, open("sitemap_videos.txt","w") as song_file:
        for line in fp:
            # split() with no argument splits on any run of whitespace and
            # yields nothing for a blank line, so empty "URLs" are never
            # fetched.
            for current_url in line.split():
                embed_codes = get_embed_code(current_url)
                raw_header = '<url><loc>{URL}</loc>'
                # Escape the page URL: a query string containing "&" would
                # otherwise make the output malformed XML.
                header = raw_header.replace('{URL}', escape(current_url))
                song_file.write(header)

                for embed_code in embed_codes:
                    embed_video_url = 'https://www.youtube.com/embed/' + embed_code
                    info = get_info_from_embed(embed_code)
                    # The helper returns an empty dict (never None) when the
                    # video cannot be looked up; skip it instead of failing
                    # on a missing title.
                    if not info:
                        continue
                    title = info.get('title')
                    thumb_url = info.get('thumb_url')
                    desc = 'Custom Text:' + title
                    video_section = get_sitemap_section(thumb_url, title, desc, embed_video_url)
                    song_file.write(video_section)

                song_file.write('</url>')