#!/usr/bin/python
# Copyright (c) 2016, Oskar Skog
# This file is released under the 2-clause BSD license,
# a.k.a. the FreeBSD license
# URL:
'''
Create joke pages for the unpreferred www and tridecane subdomains.
The pages will link to the intended page.

See the doc-string for `main` for more information.

Try it at: http://tridecane.oskog97.com/read/?path=/tridecane/index.cgi
'''

import sys
sys.path.append('/var/www')     # NOTICE
import htmlescape
import compressout

import os
import base64
import re
import subprocess
import time

import cgitb
cgitb.enable()

#id="global-variables"
# Global variables
host = 'oskog97.com'
proto = 'https'
base_dir = 'tridecane'
fs_base_dir = '/var/www/tridecane'
referer_site = 'http://tridecane.oskog97.com'
user_agent = ('Tridecane linkmaker ' +
              'https://oskog97.com/read/?path=/tridecane/index.cgi')

redirect = [
    (r'^/favicon\.ico$', r'/favicon.png'),
    (r'^/sitemap(.*)\.xml$', r'/sitemap\1.xml'),
    (r'^/read\.py\?(.*)sitemap=xml(&.*)?$', r'/read.py?sitemap=xml'),
    (r'^/google(.*)\.html$', r'/google\1.html'),
]

# '*' means that QUERY_STRING must not be altered,
# while a *list* of strings means that only the listed parameters may
# be used, and only in the specified order.  (Mutually exclusive
# parameters are NOT handled.)
parameters = {
    '/read/': ['sitemap', 'path', 'download', 'referer', 'title'],
    '/test.cgi': '*',
}

# I don't want the system to be overloaded, so I'll put some
# restrictions here.
maxload = {
    'throttle-file': '/var/www/tridecane/throttle',
    'throttle-requests': 3,
    'throttle-time': 6,
    'load-avg-1': 3.5,
    'retry-after': 90,
    '503-file': '/var/www/oops/cgi503.html',
}
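# Illustration (comments only, not executed): how the third rule in
# `redirect` above rewrites a URL.  The query string is made up.
#
#     >>> re.sub(r'^/read\.py\?(.*)sitemap=xml(&.*)?$',
#     ...        r'/read.py?sitemap=xml',
#     ...        '/read.py?download=1&sitemap=xml&path=/foo')
#     '/read.py?sitemap=xml'
#
# Everything else in the query string is dropped, so the redirect
# always lands on the bare sitemap URL.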


def make_html_link(destination):
    '''
    Create html code (inline, doesn't come pre-wrapped in a p element)
    with a link to `destination`.

    `destination` MUST be a valid URL.

    The link text will be pulled from the target page's title, if any.
    If the target responds with 4xx, a link to the homepage will be
    returned instead.  The hostname and protocol for the homepage are
    defined in the global variables `host` and `proto`.

    This function requires the HEAD(1) and GET(1) commands from
    lwp-request(1).
    '''
    def check_output_nofail(*args):
        # Single-quote each argument for the shell; embedded single
        # quotes become '"'"'.
        cmd = ' '.join(map(
            lambda arg: "'" + arg.replace("'", "'\"'\"'") + "'",
            args
        ))
        # `|| true` keeps check_output from raising if the command
        # exits with a non-zero status.
        return subprocess.check_output(cmd + ' || true', shell=True)
    head = check_output_nofail(
        'HEAD',
        '-H', 'User-Agent: ' + user_agent,
        '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),
        destination,
    )
    # NOTICE: `status` is only the first digit.
    status = head[0]
    # (The <a> markup in the messages below is reconstructed from the
    # escape arguments; the tags did not survive in this copy.)
    if status == '4':
        html = """The page you're looking for doesn't seem to exist.
            Would you like to go to the
            <a href="{}">homepage</a> instead?
        """.format(proto + '://' + host + '/')
    elif status == '5':
        html = htmlescape.escape(
            '''<a href="{}">&lt;{}&gt;</a> is temporarily
            malfunctioning.  Try again later.''',
            2, destination, 1, destination,
        )
    elif status != '2':
        html = htmlescape.escape(
            'Unknown error with <a href="{}">&lt;{}&gt;</a>',
            2, destination, 1, destination,
        )
    else:
        ishtml = (
            ('Content-Type: text/html' in head) or
            ('Content-Type: application/xhtml+xml' in head)
        )
        # Fetch the body if HTML.
        if ishtml:
            body = check_output_nofail(
                'GET',
                '-H', 'User-Agent: ' + user_agent,
                '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),
                destination,
            )
        else:
            body = ''
        # Use the HTML title if available.
        if '<title>' in body and '</title>' in body:
            title = body.split('<title>')[1].split('</title>')[0]
        else:
            title = htmlescape.escape('{}', 1, destination)
        html = htmlescape.escape(
            '''Perhaps you're looking for
            <a href="{}">{}</a> (without the www)?''',
            2, destination, 0, title,
        )
    return html
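# A note on the `htmlescape.escape` calls above: each '{}' placeholder
# is followed by an (escape mode, value) argument pair.  The module's
# documentation is not included in this file, but judging from the
# calls above, mode 0 seems to insert the value as-is, mode 1 to
# escape it as text content, and mode 2 to escape it for use inside
# an attribute value.  Sketch (url and text are hypothetical names):
#
#     htmlescape.escape('<a href="{}">{}</a>', 2, url, 1, text)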

def page(request_uri):
    '''
    Print out the page for the (partially) canonicalized `request_uri`.

    This function assumes `compressout.init` has already been called
    and that `compressout.done` will be called after returning.
    '''
    xhtml_mime = 'application/xhtml+xml'
    mime = 'text/html'
    if xhtml_mime in os.getenv('HTTP_ACCEPT', ''):
        mime = xhtml_mime
    compressout.write_h('Status: 404\n')
    compressout.write_h('Content-Type: {}; charset=UTF-8\n'.format(mime))
    compressout.write_h('\n')
    # NOTE: The markup below is a reconstruction; the original tags
    # and image file names did not survive in this copy of the file.
    # Only the visible text and the alt texts are original.
    compressout.write_b('''<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    <head>
        <style type="text/css">{stylesheet}</style>
        <title>Sorry, but I'm not /that/ interested in chemistry</title>
    </head>
    <body>
        <p>
            <img src="/{base_dir}/skeletal.png"
                alt="(skeletal formula for tridecane)"/>
            www.{host}
        </p>

        <h1>Sorry, but I'm not that interested in chemistry</h1>

        <p>
            <img src="/{base_dir}/balls-and-sticks.png"
                alt="(Image): balls and sticks model of tridecane"/>
        </p>
        <p>(Image is taken from Wikipedia.  License: CC-BY-SA)</p>

'''.format(
        host=host,
        proto=proto,
        base_dir=base_dir,
        stylesheet=open(fs_base_dir + '/style.css').read(),
    ))
    # Wrap the inline link HTML from `make_html_link` in a p element.
    compressout.write_b('<p>{}</p>\n'.format(
        make_html_link(proto + '://' + host + request_uri)
    ))
    compressout.write_b('''

        <h2>The acronym WWW has some interesting properties</h2>
    </body>

</html>\n''')


def main():
    r'''
    Handle requests to the tridecane/www subdomain.

    - /robots.txt is served properly.
    - Certain URLs can be redirected.
    - Static pages will have parameters stripped out.
    - Dynamic pages will have their valid parameters sorted.
    - Dynamic pages that don't use the usual format for the query
      string are also supported.

    This function assumes `compressout.init` has already been called
    and that `compressout.done` will be called after returning.

    The global variable `redirect` is a list of tuples of
    (regex, replacement).  The replacement part follows the Python
    regex syntax, with \1 \2 ... as back-references.

    The global variable `parameters` is a dictionary where the keys
    are the parts before '?' of the relative URLs to the dynamic
    pages.  The value is either '*', which means that the query string
    will be untouched, or a list of strings where each string is a
    valid parameter/variable.  The parameters in the canonicalized
    relative URL will come in the same order as specified in
    `parameters`.

    Misc global variables
    ---------------------

    `host`
        The hostname for the canonical site version.
    `proto`
        'http' or 'https'
    `base_dir`
        Relative URL without leading and trailing slashes; where to
        find external files (images) for the generated pages.
    `fs_base_dir`
        Absolute filesystem path, without trailing slash, to the
        directory `base_dir`.  (robots.txt and style.css are supposed
        to be there.)
    `referer_site`
        For setting the Referer HTTP header when pulling in the title
        from the preferred site version.  scheme://host (no trailing
        slash); host is the hostname for the tridecane site.
    `user_agent`
        For setting the User-Agent HTTP header when pulling in the
        title from the preferred site version.
    `maxload`
        See the docstring for `overloaded`.
    '''
    request_uri = os.getenv('REQUEST_URI')
    #query_string = os.getenv('QUERY_STRING', '')    # BUG
    compressout.write_h('Cache-Control: max-age=1209600\n')
    # /robots.txt
    if request_uri == '/robots.txt':
        try:
            robots_txt = open(fs_base_dir + '/robots.txt').read()
            compressout.write_h('Content-Type: text/plain\n\n')
            compressout.write_b(robots_txt)
        except IOError:
            compressout.write_h('Status: 404\n\n')
        return
    # Deal with redirections.
    for regex, replacement in redirect:
        if re.match(regex, request_uri) is not None:
            destination = re.sub(regex, replacement, request_uri)
            compressout.write_h('Status: 301\n')
            compressout.write_h(
                'Location: {proto}://{host}{destination}\n'.format(
                    host=host, proto=proto, destination=destination
                )
            )
            compressout.write_h('\n')
            return
    # Automatically canonicalize parameters.
    if '?' in request_uri:
        cgi_name, query_string = request_uri.split('?', 1)
        if cgi_name not in parameters:
            # Should not have any parameters.
            request_uri = cgi_name
        elif parameters[cgi_name] == '*':
            # Do not change the request_uri.
            pass
        else:
            # Auto-correct request_uri: keep only the recognized
            # parameters, in the order listed in `parameters`.
            valid_parameters = []
            for valid in parameters[cgi_name]:
                if (valid + '=') in query_string:
                    if query_string.startswith(valid + '='):
                        value = query_string.split('=', 1)[1]
                    else:
                        value = query_string.split('&' + valid + '=')[1]
                    value = value.split('&')[0]
                    valid_parameters.append(valid + '=' + value)
            request_uri = cgi_name + '?' + '&'.join(valid_parameters)
    # Let `page` print the actual page.
    page(request_uri)
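# Illustration (comments only, not executed): with `parameters` as
# defined at the top of this file, `main` canonicalizes the made-up
# request
#
#     /read/?title=Foo&junk=1&path=/x
#
# into
#
#     /read/?path=/x&title=Foo
#
# Unknown parameters are dropped, and the recognized ones are emitted
# in the order they appear in parameters['/read/'].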
def overloaded():
    r'''
    Prevent over-revving the server.

    Returns True if a 503 page should be shown, and False if not.

    The global variable `maxload` is a dictionary containing:

    `load-avg-1`
        Maximum average load during the last minute.
    `throttle-file`
        A file for recording the times of the last
        `throttle-requests` requests.
        Initial content SHOULD be '0\n', i.e. a zero.
    `throttle-requests`
        The highest allowed number of requests in `throttle-time`
        seconds.
    `throttle-time`
        The shortest time span in which `throttle-requests` requests
        are allowed to be made.
    `retry-after`
        Time in seconds for the Retry-After HTTP header.
    `503-file`
        The filesystem path to a static HTML file with a
        Service Unavailable message.
    '''
    def status503():
        compressout.write_h('Status: 503\n')
        compressout.write_h('Content-Type: text/html; charset=UTF-8\n')
        compressout.write_h('Retry-After: {}\n'.format(
            maxload['retry-after']
        ))
        compressout.write_h('\n')
        compressout.write_b(open(maxload['503-file']).read())
    # Refuse right away if the system load is already too high.
    if os.getloadavg()[0] > maxload['load-avg-1']:
        status503()
        return True
    # Read the times of the previously recorded requests.
    try:
        access_times = map(
            float,
            open(maxload['throttle-file']).read().strip().split(':')
        )
    except (IOError, ValueError):
        access_times = [0]
    # Refuse if the oldest recorded request is too recent.
    if time.time() - access_times[-1] < maxload['throttle-time']:
        status503()
        return True
    # Record this request and drop the oldest entry.
    access_times.insert(0, time.time())
    access_times = access_times[:maxload['throttle-requests']]
    f = open(maxload['throttle-file'], 'w')
    f.write(':'.join(map(str, access_times)) + '\n')
    f.close()
    return False


if __name__ == '__main__':
    compressout.init()
    if not overloaded():
        main()
    compressout.done()
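# Illustration of the throttle file's contents over a few requests
# (t1 < t2 < t3 are timestamps; the values are symbolic):
#
#     0               <- initial content
#     t1:0            <- after the 1st request
#     t2:t1:0         <- after the 2nd
#     t3:t2:t1        <- after the 3rd (capped at 'throttle-requests')
#
# The next request is refused with a 503 while less than
# 'throttle-time' (6) seconds have passed since t1, the oldest
# recorded timestamp.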