Skip to content

Instantly share code, notes, and snippets.

@cnsoft
Forked from fqrouter/worker nginx conf
Created March 26, 2014 15:27
Show Gist options
  • Save cnsoft/9785998 to your computer and use it in GitHub Desktop.
Save cnsoft/9785998 to your computer and use it in GitHub Desktop.

Revisions

  1. @fqrouter fqrouter revised this gist Mar 17, 2014. 1 changed file with 126 additions and 1 deletion.
    127 changes: 126 additions & 1 deletion youtube-reverse-proxy.py
    Original file line number Diff line number Diff line change
    @@ -162,4 +162,129 @@ def handle_css(upstream_url, environ, start_response):
    for k, v in response.headers.items():
    if 'set-cookie' == k.lower():
    v = v.replace('domain=.youtube.com;', '')

    headers.append((k, v))
    start_response(httplib.OK, headers)
    body = response.read()
    body = RE_YTIMG.sub(replace_ytimg, body)
    return [body]

    def handle_watch(video_id, environ, start_response):
    video_url = REDIS.get(video_id)
    if video_url:
    LOGGER.info('%s hit cache' % video_id)
    else:
    LOGGER.info('get url for movie: %s' % video_id)
    try:
    video_url = proc_pool.spawn(get_url, video_id).get()
    except:
    LOGGER.exception('failed to get url')
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    if 'googlevideo.com' not in video_url:
    LOGGER.error('googlevideo.com not in url: %s' % video_url)
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    video_url = video_url.replace('https://', 'http://')
    history = set()
    success = False
    for i in range(3):
    worker = pick_worker(history)
    try_url = RE_GOOGLEVIDEO.sub(functools.partial(replace_googlevideo, worker), video_url)
    if is_url_correct(try_url):
    video_url = try_url
    success = True
    break
    # else:
    # worker[1] = False
    if not success:
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    REDIS.set(video_id, video_url)
    REDIS.expire(video_id, 60 * 3)
    LOGGER.info('got url for movie: %s %s' % (video_id, video_url))
    start_response(httplib.FOUND, [
    ('Location', video_url),
    ('Content-Type', 'text/plain'),
    ('Cache-Control', 'max-age=180')
    ])
    return ['you can use this link to download the movie']


    def get_url(video_id):
    if '/' in video_id:
    raise Exception('evil')
    return subprocess.check_output(
    'youtube-dl http://www.youtube.com/watch?v=%s -g'
    % video_id, shell=True).strip()


    def serve_forever():
    try:
    server = WSGIServer((LISTEN_IP, LISTEN_PORT), handle_request)
    LOGGER.info('serving HTTP on port %s:%s...' % (LISTEN_IP, LISTEN_PORT))
    except:
    LOGGER.exception('failed to start HTTP server on port %s:%s' % (LISTEN_IP, LISTEN_PORT))
    os._exit(1)
    server.serve_forever()


    def pick_worker(history=()):
    if len(history) >= len(WORKERS):
    raise Exception('no worker')
    server_name = random.choice(WORKERS.keys())
    worker = random.choice(WORKERS[server_name])
    if not worker[1]:
    return pick_worker(set(list(history) + [server_name]))
    return worker


    def is_url_correct(url):
    class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
    infourl = urllib.addinfourl(fp, headers, req.get_full_url())
    infourl.status = code
    infourl.code = code
    return infourl

    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    try:
    opener = urllib2.build_opener(NoRedirectHandler())
    response = opener.open(url)
    response.close()
    if 200 == response.code:
    return True
    else:
    LOGGER.error('status code %s for url %s' % (response.code, url))
    return False
    except:
    LOGGER.exception('try url failed: %s' % url)
    return False


    def refresh_workers():
    while True:
    for workers in WORKERS.values():
    for worker in workers:
    worker[1] = is_worker_alive(worker[0])
    LOGGER.info('%s refreshed workers' % datetime.datetime.now())
    gevent.sleep(60 * 60)


    def is_worker_alive(worker_host):
    try:
    urllib2.urlopen('http://%s/image/i1/vi/tLcfAnN2QgY/mqdefault.jpg' % worker_host, timeout=3).close()
    LOGGER.info('%s => OK' % worker_host)
    return True
    except:
    LOGGER.info('%s => FAILURE' % worker_host)
    return False


    signal.signal(signal.SIGINT, lambda signum, fame: os._exit(0))
    logging.basicConfig(level=logging.DEBUG)
    gevent.spawn(refresh_workers)
    serve_forever()
  2. @fqrouter fqrouter revised this gist Mar 17, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions youtube-reverse-proxy.py
    Original file line number Diff line number Diff line change
    @@ -63,7 +63,7 @@ def get_http_response(code):


    def replace_ytimg_css(match):
    return '/23.226.226.92/css/%s' % match.group(1)
    return '/your-reverse-proxy-ip/css/%s' % match.group(1)

    def replace_ytimg_esc(match):
    return '\\/%s\\/image/%s' % (pick_worker()[0], match.group(1))
    @@ -153,7 +153,7 @@ def handle(environ, start_response):
    body = body.replace('class="premium-yva-unexpanded"', 'style="display: none;"')
    # body = body.replace('id="masthead-search"', 'style="position: relative; padding: 0; margin-top: 3px; overflow: hidden;"')
    body = body.replace('ad.doubleclick.net', '127.0.0.1')
    body = body.replace('www.youtube.com', '23.226.226.92')
    body = body.replace('www.youtube.com', 'your-reverse-proxy-ip')
    return [body]

    def handle_css(upstream_url, environ, start_response):
  3. @fqrouter fqrouter revised this gist Mar 17, 2014. 1 changed file with 1 addition and 127 deletions.
    128 changes: 1 addition & 127 deletions youtube-reverse-proxy.py
    Original file line number Diff line number Diff line change
    @@ -146,7 +146,6 @@ def handle(environ, start_response):
    body = RE_YTIMG.sub(replace_ytimg, body)
    body = RE_GGPHT.sub(replace_ggpht, body)
    # body = body.replace('class="search-form', 'method="POST" class="search-form')
    body = body.replace('</head>', '<script src="http://exp.jiankongbao.com/loadtrace.php?host_id=12669&style=5&type=0" type="text/javascript"></script></head>')
    body = body.replace('class="video-masthead">', 'style="display: none;">')
    body = body.replace('class="branded-page-v2-top-row">', 'style="display: none;">')
    body = body.replace('style="z-index: 1">', 'style="display: none;">')
    @@ -163,129 +162,4 @@ def handle_css(upstream_url, environ, start_response):
    for k, v in response.headers.items():
    if 'set-cookie' == k.lower():
    v = v.replace('domain=.youtube.com;', '')
    headers.append((k, v))
    start_response(httplib.OK, headers)
    body = response.read()
    body = RE_YTIMG.sub(replace_ytimg, body)
    return [body]

    def handle_watch(video_id, environ, start_response):
    video_url = REDIS.get(video_id)
    if video_url:
    LOGGER.info('%s hit cache' % video_id)
    else:
    LOGGER.info('get url for movie: %s' % video_id)
    try:
    video_url = proc_pool.spawn(get_url, video_id).get()
    except:
    LOGGER.exception('failed to get url')
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    if 'googlevideo.com' not in video_url:
    LOGGER.error('googlevideo.com not in url: %s' % video_url)
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    video_url = video_url.replace('https://', 'http://')
    history = set()
    success = False
    for i in range(3):
    worker = pick_worker(history)
    try_url = RE_GOOGLEVIDEO.sub(functools.partial(replace_googlevideo, worker), video_url)
    if is_url_correct(try_url):
    video_url = try_url
    success = True
    break
    # else:
    # worker[1] = False
    if not success:
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    REDIS.set(video_id, video_url)
    REDIS.expire(video_id, 60 * 3)
    LOGGER.info('got url for movie: %s %s' % (video_id, video_url))
    start_response(httplib.FOUND, [
    ('Location', video_url),
    ('Content-Type', 'text/plain'),
    ('Cache-Control', 'max-age=180')
    ])
    return ['you can use this link to download the movie']


    def get_url(video_id):
    if '/' in video_id:
    raise Exception('evil')
    return subprocess.check_output(
    'youtube-dl http://www.youtube.com/watch?v=%s -g'
    % video_id, shell=True).strip()


    def serve_forever():
    try:
    server = WSGIServer((LISTEN_IP, LISTEN_PORT), handle_request)
    LOGGER.info('serving HTTP on port %s:%s...' % (LISTEN_IP, LISTEN_PORT))
    except:
    LOGGER.exception('failed to start HTTP server on port %s:%s' % (LISTEN_IP, LISTEN_PORT))
    os._exit(1)
    server.serve_forever()


    def pick_worker(history=()):
    if len(history) >= len(WORKERS):
    raise Exception('no worker')
    server_name = random.choice(WORKERS.keys())
    worker = random.choice(WORKERS[server_name])
    if not worker[1]:
    return pick_worker(set(list(history) + [server_name]))
    return worker


    def is_url_correct(url):
    class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
    infourl = urllib.addinfourl(fp, headers, req.get_full_url())
    infourl.status = code
    infourl.code = code
    return infourl

    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    try:
    opener = urllib2.build_opener(NoRedirectHandler())
    response = opener.open(url)
    response.close()
    if 200 == response.code:
    return True
    else:
    LOGGER.error('status code %s for url %s' % (response.code, url))
    return False
    except:
    LOGGER.exception('try url failed: %s' % url)
    return False


    def refresh_workers():
    while True:
    for workers in WORKERS.values():
    for worker in workers:
    worker[1] = is_worker_alive(worker[0])
    LOGGER.info('%s refreshed workers' % datetime.datetime.now())
    gevent.sleep(60 * 60)


    def is_worker_alive(worker_host):
    try:
    urllib2.urlopen('http://%s/image/i1/vi/tLcfAnN2QgY/mqdefault.jpg' % worker_host, timeout=3).close()
    LOGGER.info('%s => OK' % worker_host)
    return True
    except:
    LOGGER.info('%s => FAILURE' % worker_host)
    return False


    signal.signal(signal.SIGINT, lambda signum, fame: os._exit(0))
    logging.basicConfig(level=logging.DEBUG)
    gevent.spawn(refresh_workers)
    serve_forever()

  4. @fqrouter fqrouter created this gist Mar 17, 2014.
    34 changes: 34 additions & 0 deletions worker nginx conf
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,34 @@
# Worker-side nginx config: each location turns /<kind>/<host-prefix>/<path>
# into a pass-through proxy request to <host-prefix>.<upstream-domain>/<path>.
# DNS resolver is required because proxy_pass uses a runtime variable.
resolver 8.8.8.8;
# /video/<prefix>/<path> -> https://<prefix>.googlevideo.com/<path>
location /video/ {
if ($request_uri ~ "^/video/(.+?)/.+") {
set $upstream_host $1.googlevideo.com;
# Force a download prompt instead of inline playback.
add_header Content-Disposition "attachment; filename=video.mp4;";
}
rewrite /video/.+?/(.+)$ /$1 break;
# Stream bytes straight through; buffering large videos wastes disk/RAM.
proxy_buffering off;
proxy_pass https://$upstream_host;
proxy_set_header Host $upstream_host;
}


# /image/<prefix>/<path> -> http://<prefix>.ytimg.com/<path> (thumbnails)
location /image/ {
if ($request_uri ~ "^/image/(.+?)/.+") {
set $upstream_host $1.ytimg.com;
}
rewrite /image/.+?/(.+)$ /$1 break;
proxy_buffering off;
proxy_pass http://$upstream_host;
proxy_set_header Host $upstream_host;

}


# /photo/<prefix>/<path> -> http://<prefix>.ggpht.com/<path> (avatars)
location /photo/ {
if ($request_uri ~ "^/photo/(.+?)/.+") {
set $upstream_host $1.ggpht.com;
}
rewrite /photo/.+?/(.+)$ /$1 break;
proxy_buffering off;
proxy_pass http://$upstream_host;
proxy_set_header Host $upstream_host;
}
    291 changes: 291 additions & 0 deletions youtube-reverse-proxy.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,291 @@
    #!/usr/bin/env python
    import logging
    import httplib
    import os
    import subprocess
    import socket
    import datetime
    import random
    import signal
    import urllib2
    import urlparse
    import urllib
    import re

    from gevent.wsgi import WSGIServer
    import gevent.monkey
    import gevent.pool
    import gevent
    import redis
    import cgi
    import functools
    import Cookie

    REDIS = redis.StrictRedis()

    gevent.monkey.patch_all(subprocess=True)
    proc_pool = gevent.pool.Pool(size=16)

    LOGGER = logging.getLogger(__name__)

    WORKERS = {
    } # you need to fill this

    LISTEN_IP = ''
    LISTEN_PORT = 3000
    RE_YTIMG_CSS = re.compile(r'/s.ytimg\.com(.*?\.css)', re.IGNORECASE)
    RE_YTIMG_ESC = re.compile(r'\\/([a-zA-Z0-9-]+?)\.ytimg\.com', re.IGNORECASE)
    RE_YTIMG = re.compile(r'/([a-zA-Z0-9-]+?)\.ytimg\.com', re.IGNORECASE)
    RE_GGPHT = re.compile(r'https://([a-zA-Z0-9-]+?)\.ggpht\.com', re.IGNORECASE)
    RE_GOOGLEVIDEO = re.compile(r'/([a-zA-Z0-9-]+?)\.googlevideo\.com', re.IGNORECASE)

    def handle_request(environ, start_response):
    method = environ.get('REQUEST_METHOD')
    try:
    lines = handle(environ, lambda status, headers: start_response(get_http_response(status), headers))
    except:
    path = environ.get('PATH_INFO', '').strip('/')
    LOGGER.exception('failed to handle request: %s %s' % (method, path))
    start_response('500 INTERNAL_SERVER_ERROR', [
    ('Content-Type', 'text/javascript'),
    ('Cache-Control', 'no-cache, no-store, must-revalidate'),
    ('Pragma', 'no-cache'),
    ('Expires', '0')])
    lines = ['Retry in 30 minutes']
    for line in lines:
    yield line


    def get_http_response(code):
    if code not in httplib.responses:
    return code
    return '%s %s' % (code, httplib.responses[code])


    def replace_ytimg_css(match):
    return '/23.226.226.92/css/%s' % match.group(1)

    def replace_ytimg_esc(match):
    return '\\/%s\\/image/%s' % (pick_worker()[0], match.group(1))

    def replace_ytimg(match):
    return '/%s/image/%s' % (pick_worker()[0], match.group(1))

    def replace_ggpht(match):
    return 'http://%s/photo/%s' % (pick_worker()[0], match.group(1))

    def replace_googlevideo(worker, match):
    return '/%s/video/%s' % (worker[0], match.group(1))


    def handle(environ, start_response):
    host = 'youtube.com'
    path = environ.get('PATH_INFO', '')
    if '/watch' == path:
    video_id = urlparse.parse_qs(environ['QUERY_STRING'])['v'][0]
    return handle_watch(video_id, environ, start_response)
    if '/watch_videos' == path:
    video_id = urlparse.parse_qs(environ['QUERY_STRING'])['video_ids'][0].split(',')[0]
    return handle_watch(video_id, environ, start_response)
    if path.startswith('/css/'):
    upstream_url = path.replace('/css/', '')
    upstream_url = 'http://s.ytimg.com/%s' % upstream_url
    return handle_css(upstream_url, environ, start_response)
    if path.startswith('/t/'):
    domain = path.replace('/t/', '').replace('.js', '')
    words = 'window.location.href="http://%s";' % domain
    start_response(httplib.OK, [
    ('Content-Type', 'text/javascript'),
    ('Cache-Control', 'no-cache, no-store, must-revalidate'),
    ('Pragma', 'no-cache'),
    ('Expires', '0')])
    return [words]
    if path.startswith('//'):
    start_response(httplib.FOUND, [
    ('Location', 'http:%s' % path)
    ])
    return []
    data = None
    if 'POST' == environ['REQUEST_METHOD']:
    if '/results' == path:
    post_body = cgi.FieldStorage(
    fp=environ['wsgi.input'],
    environ=environ,
    keep_blank_values=True)
    upstream_url = 'http://youtube.com/results?%s' % urllib.urlencode({'search_query': post_body['search_query'].value})
    else:
    data = environ['wsgi.input'].readline()
    upstream_url = 'http://%s%s' % (host, path)
    else:
    upstream_url = 'http://%s%s' % (host, path)
    if environ['QUERY_STRING']:
    upstream_url = '%s?%s' % (upstream_url, environ['QUERY_STRING'])
    LOGGER.info('upstream url: %s' % upstream_url)
    headers = {}
    if environ.get('HTTP_COOKIE'):
    LOGGER.info('cookie is: %s' % environ.get('HTTP_COOKIE'))
    headers['Cookie'] = environ.get('HTTP_COOKIE')
    try:
    response = urllib2.urlopen(urllib2.Request(upstream_url, data=data, headers=headers))
    except urllib2.HTTPError as e:
    start_response(e.code, [(k, v) for k, v in e.hdrs.items()])
    return [e.msg]
    except:
    raise
    headers = []
    for k, v in response.headers.items():
    if 'set-cookie' == k.lower():
    v = v.replace('domain=.youtube.com;', '')
    if 'x-frame' in k.lower():
    continue
    headers.append((k, v))
    start_response(httplib.OK, headers)
    body = response.read()
    body = RE_YTIMG_CSS.sub(replace_ytimg_css, body)
    body = RE_YTIMG_ESC.sub(replace_ytimg_esc, body)
    body = RE_YTIMG.sub(replace_ytimg, body)
    body = RE_GGPHT.sub(replace_ggpht, body)
    # body = body.replace('class="search-form', 'method="POST" class="search-form')
    body = body.replace('</head>', '<script src="http://exp.jiankongbao.com/loadtrace.php?host_id=12669&style=5&type=0" type="text/javascript"></script></head>')
    body = body.replace('class="video-masthead">', 'style="display: none;">')
    body = body.replace('class="branded-page-v2-top-row">', 'style="display: none;">')
    body = body.replace('style="z-index: 1">', 'style="display: none;">')
    body = body.replace('style="z-index: 1;">', 'style="display: none;">')
    body = body.replace('class="premium-yva-unexpanded"', 'style="display: none;"')
    # body = body.replace('id="masthead-search"', 'style="position: relative; padding: 0; margin-top: 3px; overflow: hidden;"')
    body = body.replace('ad.doubleclick.net', '127.0.0.1')
    body = body.replace('www.youtube.com', '23.226.226.92')
    return [body]

    def handle_css(upstream_url, environ, start_response):
    response = urllib2.urlopen(urllib2.Request(upstream_url))
    headers = []
    for k, v in response.headers.items():
    if 'set-cookie' == k.lower():
    v = v.replace('domain=.youtube.com;', '')
    headers.append((k, v))
    start_response(httplib.OK, headers)
    body = response.read()
    body = RE_YTIMG.sub(replace_ytimg, body)
    return [body]

    def handle_watch(video_id, environ, start_response):
    video_url = REDIS.get(video_id)
    if video_url:
    LOGGER.info('%s hit cache' % video_id)
    else:
    LOGGER.info('get url for movie: %s' % video_id)
    try:
    video_url = proc_pool.spawn(get_url, video_id).get()
    except:
    LOGGER.exception('failed to get url')
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    if 'googlevideo.com' not in video_url:
    LOGGER.error('googlevideo.com not in url: %s' % video_url)
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    video_url = video_url.replace('https://', 'http://')
    history = set()
    success = False
    for i in range(3):
    worker = pick_worker(history)
    try_url = RE_GOOGLEVIDEO.sub(functools.partial(replace_googlevideo, worker), video_url)
    if is_url_correct(try_url):
    video_url = try_url
    success = True
    break
    # else:
    # worker[1] = False
    if not success:
    start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
    return ['no valid url']
    REDIS.set(video_id, video_url)
    REDIS.expire(video_id, 60 * 3)
    LOGGER.info('got url for movie: %s %s' % (video_id, video_url))
    start_response(httplib.FOUND, [
    ('Location', video_url),
    ('Content-Type', 'text/plain'),
    ('Cache-Control', 'max-age=180')
    ])
    return ['you can use this link to download the movie']


    def get_url(video_id):
    if '/' in video_id:
    raise Exception('evil')
    return subprocess.check_output(
    'youtube-dl http://www.youtube.com/watch?v=%s -g'
    % video_id, shell=True).strip()


    def serve_forever():
    try:
    server = WSGIServer((LISTEN_IP, LISTEN_PORT), handle_request)
    LOGGER.info('serving HTTP on port %s:%s...' % (LISTEN_IP, LISTEN_PORT))
    except:
    LOGGER.exception('failed to start HTTP server on port %s:%s' % (LISTEN_IP, LISTEN_PORT))
    os._exit(1)
    server.serve_forever()


    def pick_worker(history=()):
    if len(history) >= len(WORKERS):
    raise Exception('no worker')
    server_name = random.choice(WORKERS.keys())
    worker = random.choice(WORKERS[server_name])
    if not worker[1]:
    return pick_worker(set(list(history) + [server_name]))
    return worker


    def is_url_correct(url):
    class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
    infourl = urllib.addinfourl(fp, headers, req.get_full_url())
    infourl.status = code
    infourl.code = code
    return infourl

    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    try:
    opener = urllib2.build_opener(NoRedirectHandler())
    response = opener.open(url)
    response.close()
    if 200 == response.code:
    return True
    else:
    LOGGER.error('status code %s for url %s' % (response.code, url))
    return False
    except:
    LOGGER.exception('try url failed: %s' % url)
    return False


    def refresh_workers():
    while True:
    for workers in WORKERS.values():
    for worker in workers:
    worker[1] = is_worker_alive(worker[0])
    LOGGER.info('%s refreshed workers' % datetime.datetime.now())
    gevent.sleep(60 * 60)


    def is_worker_alive(worker_host):
    try:
    urllib2.urlopen('http://%s/image/i1/vi/tLcfAnN2QgY/mqdefault.jpg' % worker_host, timeout=3).close()
    LOGGER.info('%s => OK' % worker_host)
    return True
    except:
    LOGGER.info('%s => FAILURE' % worker_host)
    return False


    signal.signal(signal.SIGINT, lambda signum, fame: os._exit(0))
    logging.basicConfig(level=logging.DEBUG)
    gevent.spawn(refresh_workers)
    serve_forever()