Re-organize some of the code for munging HTML documents

R. Tyler Ballance 2010-06-27 21:24:28 -07:00
parent 16273f0665
commit d6cc068792
1 changed file with 94 additions and 1 deletion


@@ -1,21 +1,29 @@
#!/usr/bin/env python
from __future__ import with_statement
import base64
import contextlib
import gzip
import StringIO
import time
import eventlet
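# monkey_patch() replaces blocking stdlib modules (socket, time, and friends)
# with eventlet's cooperative green versions, so downstream I/O yields to the hub.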
eventlet.monkey_patch()
import eventlet.wsgi
from eventlet.green import httplib
from eventlet.green import urllib2
import lxml
import lxml.html
import memcache
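
# Client request headers that get forwarded to the upstream server; the cookie
# and duplicate charset entries are currently commented out.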
PROXIED_HEADERS = ('HTTP_USER_AGENT', 'HTTP_ACCEPT_CHARSET', 'HTTP_ACCEPT',
        'HTTP_ACCEPT_LANGUAGE',)  # 'HTTP_COOKIE', 'HTTP_ACCEPT_CHARSET'
REDIRECT_CODES = (301, 302, 303,)
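# Local memcached instance; note the non-default port (memcached's default is 11211).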
CACHE = memcache.Client(('127.0.0.1:11212',))
def wsgi_ok(start_response, output, headers):
    start_response('200 OK', [(k, v) for k, v in headers.iteritems()])
    return [output]
@@ -33,6 +41,57 @@ def fetch_from(method, url, headers):
    finally:
        print ('fetch_from(%s, %s, ..) took %s' % (method, url, (time.time() - start)))
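
# Munger walks a parsed HTML document and rewrites external resource
# references (images, stylesheets) as inline data: URIs, so a page can be
# served as one self-contained response; handlers run on green threads.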
class Munger(object):
    def __init__(self, page_content, **kwargs):
        self.pool = eventlet.GreenPool()
        self.page_content = page_content
        self.doc = lxml.html.document_fromstring(page_content)
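
    # Dispatch each element to a tag-named handler (_handle_img, _handle_link,
    # ...) when one is defined; handlers run concurrently on the green pool
    # and mutate elements in place before the document is re-serialized.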
    def munge(self):
        for element in self.doc.getiterator():
            method = '_handle_%s' % element.tag
            method = getattr(self, method, None)
            if method is None:
                continue
            self.pool.spawn(method, element)
        self.pool.waitall()
        return lxml.html.tostring(self.doc)
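
    # Inline <img> sources as base64 data: URIs. The MIME subtype is guessed
    # naively from the URL's file extension (so "foo.png?v=2" yields an odd
    # type), and base64.encodestring() output keeps embedded newlines.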
    def _handle_img(self, elem):
        if not elem.attrib.get('src'):
            return elem
        source = elem.attrib['src']
        image = fetch_from('GET', source, {})
        image = image.read()
        b64image = base64.encodestring(image)
        pieces = source.split('.')
        elem.attrib['src'] = 'data:image/%s;base64,%s' % (pieces[-1], b64image)
        return elem
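
    # Inline stylesheets referenced by <link type="text/css"> as data: URIs;
    # any other <link> element (icons, feeds, ...) is returned unmodified.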
    def _handle_link(self, elem):
        if not elem.attrib.get('href') or not elem.attrib.get('type') == 'text/css':
            return elem
        href = elem.attrib['href']
        css = fetch_from('GET', href, {})
        css = css.read()
        b64css = base64.encodestring(css)
        elem.attrib['href'] = 'data:text/css;base64,%s' % b64css
        return elem
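
    # Disabled: the "_ignore_" prefix means munge() never looks this up
    # (it only resolves "_handle_<tag>" names). Note the data: URI below
    # lacks the ";base64" marker even though the payload is base64-encoded.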
    def _ignore_handle_script(self, elem):
        if not elem.attrib.get('src'):
            return elem
        src = elem.attrib['src']
        js = fetch_from('GET', src, {})
        js = js.read()
        b64js = base64.encodestring(js)
        elem.attrib['src'] = 'data:text/x-js,%s' % b64js
        return elem

def wsgi_proxy(env, start_response):
    if not env['wsgi.url_scheme'] == 'http':
        return wsgi_error(start_response, 'Error\r\n', {})
@@ -47,7 +106,18 @@ def wsgi_proxy(env, start_response):
    if env['QUERY_STRING']:
        url = '%s?%s' % (url, env['QUERY_STRING'])
    response = fetch_from(env['REQUEST_METHOD'], url, headers)
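
    # Cache lookup is disabled for now ("if False"); the commented-out
    # condition shows the intended memcache check.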
    cached = False
    #if CACHE.get(url):
    if False:
        print '>>> Getting %s from the cache' % url
        cached = True
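
    # Propagate upstream HTTP errors straight through as the response status.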
    try:
        response = fetch_from(env['REQUEST_METHOD'], url, headers)
    except urllib2.HTTPError, ex:
        start_response('%s %s' % (ex.getcode(), ex.info()), [])
        return ['']
    headers = dict(response.headers)
    if response.code in REDIRECT_CODES:
@@ -60,7 +130,30 @@ def wsgi_proxy(env, start_response):
    headers.pop('transfer-encoding', None)
    print ('headers', headers)
    response = response.read()
    parts = url.split('.')
    suffix = parts[-1]
    if suffix:
        suffix = suffix.split('?')[0]
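
    # Only text/html responses get munged; everything else passes through
    # unchanged. (The exact match misses "text/html; charset=..." variants.)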
    munger = None
    if headers.get('content-type') == 'text/html':
        munger = Munger(response)
        response = munger.munge()
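
    # Cache writes are stubbed out too; when enabled, this honors the
    # upstream Cache-Control max-age as the memcache expiry.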
    #if not cached and headers.get('cache-control'):
    if False:
        parts = headers['cache-control'].split(',')
        for part in parts:
            part = part.strip()
            if not part.startswith('max-age'):
                continue
            unused, age = part.split('=')
            age = int(age)
            if age <= 0:
                continue
            print ('I should cache %s for %ss (%d bytes)' % (url, age, len(response)))
            CACHE.set(url, response, time=age)
    print ('Sending proxy response for', url)
    if response and 'gzip' in env.get('HTTP_ACCEPT_ENCODING', ''):
        headers['Content-Encoding'] = 'gzip'
        start = time.time()