From d6cc0687924378da1196be86a9d9990727671ef1 Mon Sep 17 00:00:00 2001
From: "R. Tyler Ballance"
Date: Sun, 27 Jun 2010 21:24:28 -0700
Subject: [PATCH] Re-organize some of the code for munging HTML documents

---
 proxylet.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 94 insertions(+), 1 deletion(-)

diff --git a/proxylet.py b/proxylet.py
index 9d541d3..6f9556f 100644
--- a/proxylet.py
+++ b/proxylet.py
@@ -1,21 +1,29 @@
 #!/usr/bin/env python
 from __future__ import with_statement
 
+import base64
 import contextlib
 import gzip
 import StringIO
 import time
 
 import eventlet
+eventlet.monkey_patch()
 import eventlet.wsgi
 from eventlet.green import httplib
 from eventlet.green import urllib2
 
+import lxml
+import lxml.html
+import memcache
+
 PROXIED_HEADERS = ('HTTP_USER_AGENT', 'HTTP_ACCEPT_CHARSET', 'HTTP_ACCEPT',
     'HTTP_ACCEPT_LANGUAGE', )#'HTTP_COOKIE', 'HTTP_ACCEPT_CHARSET')
 
 REDIRECT_CODES = (301, 302, 303,)
 
+CACHE = memcache.Client(('127.0.0.1:11212',))
+
 def wsgi_ok(start_response, output, headers):
     start_response('200 OK', [(k, v) for k, v in headers.iteritems()])
     return [output]
@@ -33,6 +41,57 @@ def fetch_from(method, url, headers):
     finally:
         print ('fetch_from(%s, %s, ..) took %s' % (method, url, (time.time() - start)))
 
+
+class Munger(object):
+    def __init__(self, page_content, **kwargs):
+        self.pool = eventlet.GreenPool()
+        self.page_content = page_content
+        self.doc = lxml.html.document_fromstring(page_content)
+
+    def munge(self):
+        for element in self.doc.getiterator():
+            method = '_handle_%s' % element.tag
+            method = getattr(self, method, None)
+            if method is None:
+                continue
+            self.pool.spawn(method, element)
+        self.pool.waitall()
+        return lxml.html.tostring(self.doc)
+
+    def _handle_img(self, elem):
+        if not elem.attrib.get('src'):
+            return elem
+
+        source = elem.attrib['src']
+        image = fetch_from('GET', source, {})
+        image = image.read()
+        b64image = base64.encodestring(image)
+        pieces = source.split('.')
+        elem.attrib['src'] = 'data:image/%s;base64,%s' % (pieces[-1], b64image)
+        return elem
+
+    def _handle_link(self, elem):
+        if not elem.attrib.get('href') or not elem.attrib.get('type') == 'text/css':
+            return elem
+
+        href = elem.attrib['href']
+        css = fetch_from('GET', href, {})
+        css = css.read()
+        b64css = base64.encodestring(css)
+        elem.attrib['href'] = 'data:text/css;base64,%s' % b64css
+        return elem
+
+    def _ignore_handle_script(self, elem):
+        if not elem.attrib.get('src'):
+            return elem
+
+        src = elem.attrib['src']
+        js = fetch_from('GET', src, {})
+        js = js.read()
+        b64js = base64.encodestring(js)
+        elem.attrib['src'] = 'data:text/x-js,%s' % b64js
+        return elem
+
 def wsgi_proxy(env, start_response):
     if not env['wsgi.url_scheme'] == 'http':
         return wsgi_error(start_response, 'Error\r\n', {})
@@ -47,7 +106,18 @@ def wsgi_proxy(env, start_response):
     if env['QUERY_STRING']:
         url = '%s?%s' % (url, env['QUERY_STRING'])
 
-    response = fetch_from(env['REQUEST_METHOD'], url, headers)
+    cached = False
+    #if CACHE.get(url):
+    if False:
+        print '>>> Getting %s from the cache' % url
+        cached = True
+
+    try:
+        response = fetch_from(env['REQUEST_METHOD'], url, headers)
+    except urllib2.HTTPError, ex:
+        start_response('%s %s' % (ex.getcode(), ex.info()), [])
+        return ['']
+
     headers = dict(response.headers)
 
     if response.code in REDIRECT_CODES:
@@ -60,7 +130,30 @@ def wsgi_proxy(env, start_response):
     headers.pop('transfer-encoding', None)
     print ('headers', headers)
     response = response.read()
+    parts = url.split('.')
+    suffix = parts[-1]
+    if suffix:
+        suffix = suffix.split('?')[0]
+    munger = None
+    if headers.get('content-type') == 'text/html':
+        munger = Munger(response)
+        response = munger.munge()
+    #if not cached and headers.get('cache-control'):
+    if False:
+        parts = headers['cache-control'].split(',')
+        for part in parts:
+            part = part.strip()
+            if not part.startswith('max-age'):
+                continue
+            unused, age = part.split('=')
+            age = int(age)
+            if age <= 0:
+                continue
+            print ('I should cache %s for %ss (%d bytes)' % (url, age, len(response)))
+            CACHE.set(url, response, time=age)
+
+    print ('Sending proxy response for', url)
 
     if response and 'gzip' in env.get('HTTP_ACCEPT_ENCODING', ''):
         headers['Content-Encoding'] = 'gzip'
        start = time.time()
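
Note for readers skimming the diff: the Munger class added above rewrites references to external resources into inline data: URIs so the proxied page needs fewer round trips. Below is a minimal standalone sketch of the same rewrite for <img> tags; the function name inline_images and the direct urllib2.urlopen() fetch are illustrative stand-ins, since the patch itself fetches through fetch_from() and fans the handlers out across an eventlet GreenPool.

import base64
import urllib2

import lxml.html

def inline_images(page_content):
    # Hypothetical helper, not part of the patch: rewrite every
    # <img src="..."> into a base64 data: URI, as Munger._handle_img does.
    doc = lxml.html.document_fromstring(page_content)
    for elem in doc.iter('img'):
        source = elem.attrib.get('src')
        if not source:
            continue
        image = urllib2.urlopen(source).read()
        # Guess the image subtype from the URL suffix, dropping any
        # query string ('logo.png?v=2' -> 'png').
        suffix = source.split('.')[-1].split('?')[0]
        elem.attrib['src'] = 'data:image/%s;base64,%s' % (
            suffix, base64.b64encode(image))
    return lxml.html.tostring(doc)

One difference worth flagging: the sketch uses base64.b64encode() where the patch uses base64.encodestring(). encodestring() wraps its output with a newline every 76 characters, and those newlines end up embedded in the generated data: URI; b64encode() keeps the encoded payload on a single line.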