#!/usr/bin/python
# Copyright (c) 2016, Oskar Skog
# This file is released under the 2-clause BSD license,
# a.k.a. the FreeBSD license
# URL:
'''
Create joke pages for the unpreferred www and tridecane subdomains.
The pages will link to the intended page.

See the doc-string for `main` for more information.

Try it at: http://tridecane.oskog97.com/read/?path=/tridecane/index.cgi
'''

import sys
sys.path.append('/var/www')     # NOTICE
import htmlescape
import compressout

import os
import base64
import re
import subprocess
import time

import cgitb
cgitb.enable()

#id="global-variables"
# Global variables
host = 'oskog97.com'
proto = 'https'
base_dir = 'tridecane'
fs_base_dir = '/var/www/tridecane'
referer_site = 'http://tridecane.oskog97.com'
user_agent = ('Tridecane linkmaker ' +
              'https://oskog97.com/read/?path=/tridecane/index.cgi')

redirect = [
    (r'^/favicon\.ico$', r'/favicon.png'),
    (r'^/sitemap(.*)\.xml$', r'/sitemap\1.xml'),
    (r'^/read\.py\?(.*)sitemap=xml(&.*)?$', r'/read.py?sitemap=xml'),
    (r'^/google(.*)\.html$', r'/google\1.html'),
]

# '*' means that QUERY_STRING must not be altered,
# while a *list* of strings means that only the listed parameters may
# be used, and only in the specified order.  (Mutually exclusive
# parameters are NOT handled.)
parameters = {
    '/read/': ['sitemap', 'path', 'download', 'referer', 'title'],
    '/test.cgi': '*',
}

# I don't want the system to be overloaded, so I'll put some
# restrictions here.
maxload = {
    'throttle-file': '/var/www/tridecane/throttle',
    'throttle-requests': 3,
    'throttle-time': 6,
    'load-avg-1': 3.5,
    'retry-after': 90,
    '503-file': '/var/www/oops/cgi503.html',
}
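# Illustration (comments only, not executed): how the third rule in
# `redirect` above rewrites a URL.  The query string is made up.
#
#     >>> re.sub(r'^/read\.py\?(.*)sitemap=xml(&.*)?$',
#     ...        r'/read.py?sitemap=xml',
#     ...        '/read.py?download=1&sitemap=xml&path=/foo')
#     '/read.py?sitemap=xml'
#
# Everything else in the query string is dropped, so the redirect
# always lands on the bare sitemap URL.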


def make_html_link(destination):
    '''
    Create html code (inline, doesn't come pre-wrapped in a p element)
    with a link to `destination`.

    `destination` MUST be a valid URL.

    The link text will be pulled from the target page's title, if any.
    If the target responds with 4xx, a link to the homepage will be
    returned instead.  The hostname and protocol for the homepage are
    defined in the global variables `host` and `proto`.

    This function requires the HEAD(1) and GET(1) commands from
    lwp-request(1).
    '''
    def check_output_nofail(*args):
        # Single-quote each argument for the shell; embedded single
        # quotes become '"'"'.
        cmd = ' '.join(map(
            lambda arg: "'" + arg.replace("'", "'\"'\"'") + "'",
            args
        ))
        # `|| true` keeps check_output from raising if the command
        # exits with a non-zero status.
        return subprocess.check_output(cmd + ' || true', shell=True)
    head = check_output_nofail(
        'HEAD',
        '-H', 'User-Agent: ' + user_agent,
        '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),
        destination,
    )
    # NOTICE: `status` is only the first digit.
    status = head[0]
    # (The <a> markup in the messages below is reconstructed from the
    # escape arguments; the tags did not survive in this copy.)
    if status == '4':
        html = """The page you're looking for doesn't seem to exist.
            Would you like to go to the
            <a href="{}">homepage</a> instead?
        """.format(proto + '://' + host + '/')
    elif status == '5':
        html = htmlescape.escape(
            '''<a href="{}">&lt;{}&gt;</a> is temporarily
            malfunctioning.  Try again later.''',
            2, destination, 1, destination,
        )
    elif status != '2':
        html = htmlescape.escape(
            'Unknown error with <a href="{}">&lt;{}&gt;</a>',
            2, destination, 1, destination,
        )
    else:
        ishtml = (
            ('Content-Type: text/html' in head) or
            ('Content-Type: application/xhtml+xml' in head)
        )
        # Fetch the body if HTML.
        if ishtml:
            body = check_output_nofail(
                'GET',
                '-H', 'User-Agent: ' + user_agent,
                '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),
                destination,
            )
        else:
            body = ''
        # Use the HTML title if available.
        if '<title>' in body and '</title>' in body:
            title = body.split('<title>')[1].split('</title>')[0]
        else:
            title = htmlescape.escape('{}', 1, destination)
        html = htmlescape.escape(
            '''Perhaps you're looking for
            <a href="{}">{}</a> (without the www)?''',
            2, destination, 0, title,
        )
    return html
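# A note on the `htmlescape.escape` calls above: each '{}' placeholder
# is followed by an (escape mode, value) argument pair.  The module's
# documentation is not included in this file, but judging from the
# calls above, mode 0 seems to insert the value as-is, mode 1 to
# escape it as text content, and mode 2 to escape it for use inside
# an attribute value.  Sketch (url and text are hypothetical names):
#
#     htmlescape.escape('<a href="{}">{}</a>', 2, url, 1, text)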

def page(request_uri):
    '''
    Print out the page for the (partially) canonicalized `request_uri`.

    This function assumes `compressout.init` has already been called
    and that `compressout.done` will be called after returning.
    '''
    xhtml_mime = 'application/xhtml+xml'
    mime = 'text/html'
    if xhtml_mime in os.getenv('HTTP_ACCEPT', ''):
        mime = xhtml_mime
    compressout.write_h('Status: 404\n')
    compressout.write_h('Content-Type: {}; charset=UTF-8\n'.format(mime))
    compressout.write_h('\n')
    # NOTE: The markup below is a reconstruction; the original tags
    # and image file names did not survive in this copy of the file.
    # Only the visible text and the alt texts are original.
    compressout.write_b('''<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    <head>
        <style type="text/css">{stylesheet}</style>
        <title>Sorry, but I'm not /that/ interested in chemistry</title>
    </head>
    <body>
        <p>
            <img src="/{base_dir}/skeletal.png"
                alt="(skeletal formula for tridecane)"/>
            www.{host}
        </p>

        <h1>Sorry, but I'm not that interested in chemistry</h1>

        <p>
            <img src="/{base_dir}/balls-and-sticks.png"
                alt="(Image): balls and sticks model of tridecane"/>
        </p>
        <p>(Image is taken from Wikipedia.  License: CC-BY-SA)</p>

'''.format(
        host=host,
        proto=proto,
        base_dir=base_dir,
        stylesheet=open(fs_base_dir + '/style.css').read(),
    ))
    # Wrap the inline link HTML from `make_html_link` in a p element.
    compressout.write_b('<p>{}</p>\n'.format(
        make_html_link(proto + '://' + host + request_uri)
    ))
    compressout.write_b('''

        <h2>The acronym WWW has some interesting properties</h2>
    </body>

</html>\n''')


def main():
    r'''
    Handle requests to the tridecane/www subdomain.

    - /robots.txt is served properly.
    - Certain URLs can be redirected.
    - Static pages will have parameters stripped out.
    - Dynamic pages will have their valid parameters sorted.
    - Dynamic pages that don't use the usual format for the query
      string are also supported.

    This function assumes `compressout.init` has already been called
    and that `compressout.done` will be called after returning.

    The global variable `redirect` is a list of tuples of
    (regex, replacement).  The replacement part follows the Python
    regex syntax, with \1 \2 ... as back-references.

    The global variable `parameters` is a dictionary where the keys
    are the parts before '?' of the relative URLs to the dynamic
    pages.  The value is either '*', which means that the query string
    will be untouched, or a list of strings where each string is a
    valid parameter/variable.  The parameters in the canonicalized
    relative URL will come in the same order as specified in
    `parameters`.

    Misc global variables
    ---------------------

    `host`
        The hostname for the canonical site version.
    `proto`
        'http' or 'https'
    `base_dir`
        Relative URL without leading and trailing slashes; where to
        find external files (images) for the generated pages.
    `fs_base_dir`
        Absolute filesystem path, without trailing slash, to the
        directory `base_dir`.  (robots.txt and style.css are supposed
        to be there.)
    `referer_site`
        For setting the Referer HTTP header when pulling in the title
        from the preferred site version.  scheme://host (no trailing
        slash); host is the hostname for the tridecane site.
    `user_agent`
        For setting the User-Agent HTTP header when pulling in the
        title from the preferred site version.
    `maxload`
        See the docstring for `overloaded`.
    '''
    request_uri = os.getenv('REQUEST_URI')
    #query_string = os.getenv('QUERY_STRING', '')    # BUG
    compressout.write_h('Cache-Control: max-age=1209600\n')
    # /robots.txt
    if request_uri == '/robots.txt':
        try:
            robots_txt = open(fs_base_dir + '/robots.txt').read()
            compressout.write_h('Content-Type: text/plain\n\n')
            compressout.write_b(robots_txt)
        except IOError:
            compressout.write_h('Status: 404\n\n')
        return
    # Deal with redirections.
    for regex, replacement in redirect:
        if re.match(regex, request_uri) is not None:
            destination = re.sub(regex, replacement, request_uri)
            compressout.write_h('Status: 301\n')
            compressout.write_h(
                'Location: {proto}://{host}{destination}\n'.format(
                    host=host, proto=proto, destination=destination
                )
            )
            compressout.write_h('\n')
            return
    # Automatically canonicalize parameters.
    if '?' in request_uri:
        cgi_name, query_string = request_uri.split('?', 1)
        if cgi_name not in parameters:
            # Should not have any parameters.
            request_uri = cgi_name
        elif parameters[cgi_name] == '*':
            # Do not change the request_uri.
            pass
        else:
            # Auto-correct request_uri: keep only the recognized
            # parameters, in the order listed in `parameters`.
            valid_parameters = []
            for valid in parameters[cgi_name]:
                if (valid + '=') in query_string:
                    if query_string.startswith(valid + '='):
                        value = query_string.split('=', 1)[1]
                    else:
                        value = query_string.split('&' + valid + '=')[1]
                    value = value.split('&')[0]
                    valid_parameters.append(valid + '=' + value)
            request_uri = cgi_name + '?' + '&'.join(valid_parameters)
    # Let `page` print the actual page.
    page(request_uri)
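# Illustration (comments only, not executed): with `parameters` as
# defined at the top of this file, `main` canonicalizes the made-up
# request
#
#     /read/?title=Foo&junk=1&path=/x
#
# into
#
#     /read/?path=/x&title=Foo
#
# Unknown parameters are dropped, and the recognized ones are emitted
# in the order they appear in parameters['/read/'].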
def overloaded():
    r'''
    Prevent over-revving the server.

    Returns True if a 503 page should be shown, and False if not.

    The global variable `maxload` is a dictionary containing:

    `load-avg-1`
        Maximum average load during the last minute.
    `throttle-file`
        A file for recording the times of the last
        `throttle-requests` requests.
        Initial content SHOULD be '0\n', i.e. a zero.
    `throttle-requests`
        The highest allowed number of requests in `throttle-time`
        seconds.
    `throttle-time`
        The shortest time span in which `throttle-requests` requests
        are allowed to be made.
    `retry-after`
        Time in seconds for the Retry-After HTTP header.
    `503-file`
        The filesystem path to a static HTML file with a
        Service Unavailable message.
    '''
    def status503():
        compressout.write_h('Status: 503\n')
        compressout.write_h('Content-Type: text/html; charset=UTF-8\n')
        compressout.write_h('Retry-After: {}\n'.format(
            maxload['retry-after']
        ))
        compressout.write_h('\n')
        compressout.write_b(open(maxload['503-file']).read())
    # Refuse right away if the system load is already too high.
    if os.getloadavg()[0] > maxload['load-avg-1']:
        status503()
        return True
    # Read the times of the previously recorded requests.
    try:
        access_times = map(
            float,
            open(maxload['throttle-file']).read().strip().split(':')
        )
    except (IOError, ValueError):
        access_times = [0]
    # Refuse if the oldest recorded request is too recent.
    if time.time() - access_times[-1] < maxload['throttle-time']:
        status503()
        return True
    # Record this request and drop the oldest entry.
    access_times.insert(0, time.time())
    access_times = access_times[:maxload['throttle-requests']]
    f = open(maxload['throttle-file'], 'w')
    f.write(':'.join(map(str, access_times)) + '\n')
    f.close()
    return False


if __name__ == '__main__':
    compressout.init()
    if not overloaded():
        main()
    compressout.done()
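# Illustration of the throttle file's contents over a few requests
# (t1 < t2 < t3 are timestamps; the values are symbolic):
#
#     0               <- initial content
#     t1:0            <- after the 1st request
#     t2:t1:0         <- after the 2nd
#     t3:t2:t1        <- after the 3rd (capped at 'throttle-requests')
#
# The next request is refused with a 503 while less than
# 'throttle-time' (6) seconds have passed since t1, the oldest
# recorded timestamp.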