Tridecane page maker
Create joke pages for the unpreferred www and tridecane subdomains. The pages will link to the intended page.
See the doc-string for `main` for more information.
Last modified: (not recorded) | Lines: 387
Navigation: Parent directory · Download · CGIread sitemap · Main page
Quick links: check_output_nofail · global-variables · main · make_html_link · overloaded · page
#!/usr/bin/python
# Copyright (c) 2016, Oskar Skog
# This file is released under the 2-clause BSD license,
# a.k.a. the FreeBSD license
# URL: <http://oskog97.com/policy.html#license>
'''
Create joke pages for the unpreferred www and tridecane subdomains.
The pages will link to the intended page.
See the doc-string for `main` for more information.
Try it at: http://tridecane.oskog97.com/read/?path=/tridecane/index.cgi
'''
import sys
sys.path.append('/var/www') # NOTICE
import htmlescape
import compressout
import os
import base64
import re
import subprocess
import time
import cgitb
cgitb.enable()
#id="global-variables"
# Global variables

# Canonical site version: every generated link/redirect points at
# proto://host/...
host = 'oskog97.com'
proto = 'https'

# Relative URL (no leading/trailing slash) where this page's static
# assets (favicon, images, stylesheet) live on the canonical site.
base_dir = 'tridecane'
# Absolute filesystem path of `base_dir`; robots.txt and style.css
# are expected to be found here.
fs_base_dir = '/var/www/tridecane'

# Referer and User-Agent HTTP headers sent when `make_html_link`
# fetches the target page's title from the canonical site.
referer_site = 'http://tridecane.oskog97.com'
user_agent = ('Tridecane linkmaker ' +
              'https://oskog97.com/read/?path=/tridecane/index.cgi')

# (regex, replacement) pairs: request URIs matching `regex` are
# 301-redirected to the canonical host with the substituted path.
# Replacements use Python regex back-references (\1, \2, ...).
redirect = [
    (r'^/favicon\.ico$', r'/favicon.png'),
    (r'^/sitemap(.*)\.xml$', r'/sitemap\1.xml'),
    (r'^/read\.py\?(.*)sitemap=xml(&.*)?$', r'/read.py?sitemap=xml'),
    (r'^/google(.*)\.html$', r'/google\1.html'),
]

# '*' means that QUERY_STRING must not be altered
# while a *list* of strings means that only that listed parameters can
# be used and should only be used in the specified order. (Mutually
# exclusive parameters are NOT handled.)
parameters = {
    '/read/': ['sitemap', 'path', 'download', 'referer', 'title'],
    '/test.cgi': '*',
}

# I don't want the system to be overloaded, so I'll put some
# restrictions here.  See the docstring for `overloaded` for details.
maxload = {
    'throttle-file': '/var/www/tridecane/throttle',  # recent access times
    'throttle-requests': 3,   # max requests per `throttle-time` window
    'throttle-time': 6,       # window length in seconds
    'load-avg-1': 3.5,        # refuse service above this 1-minute loadavg
    'retry-after': 90,        # seconds, for the Retry-After HTTP header
    '503-file': '/var/www/oops/cgi503.html',  # static 503 page body
}
def make_html_link(destination):
    '''
    Create html code (inline, doesn't come pre-wrapped in a p element)
    with a link to `destination`.

    `destination` MUST be a valid URL.

    The link text will be pulled from the target page's title if any.
    If the target responds with 4xx a link to the homepage will be
    returned instead.  The hostname and protocol for the homepage
    are defined in the global variables `host` and `proto`.

    This function requires the HEAD(1) and GET(1) command from
    lwp-request(1).
    '''
    def check_output_nofail(*args):
        # Run the command WITHOUT a shell and return whatever output it
        # produced even on a non-zero exit status -- the same effect as
        # the old "cmd || true" idiom.  The previous implementation
        # built a shell command line with hand-rolled single-quote
        # escaping and shell=True; since `destination` ultimately
        # derives from the request URI (untrusted input), the list form
        # removes any risk of shell injection.
        try:
            return subprocess.check_output(list(args))
        except subprocess.CalledProcessError as error:
            return error.output
    head = check_output_nofail(
        'HEAD',
        '-H', 'User-Agent: ' + user_agent,
        '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),
        destination,
    )
    # NOTICE: `status` is only the first digit.  lwp-request's HEAD
    # prints the status line first, e.g. "200 OK", so head[0] is the
    # class of the HTTP status code.
    status = head[0]
    if status == '4':
        # Page doesn't exist on the canonical site either: offer the
        # homepage instead.
        html = """The page you're looking for doesn't seem to exist.
            Would you like to go to the <a href="{}">homepage</a> instead?
        """.format(proto + '://' + host + '/')
    elif status == '5':
        html = htmlescape.escape(
            '''<<a href="{}" rel="nofollow">{}</a>> is temporarily
            malfunctioning. Try again later.''',
            2, destination,
            1, destination,
        )
    elif status != '2':
        html = htmlescape.escape(
            'Unknown error with <<a href="{}" rel="nofollow">{}</a>>',
            2, destination,
            1, destination,
        )
    else:
        # 2xx: fetch the body (HTML only) to extract a link text.
        ishtml = (
            ('Content-Type: text/html' in head) or
            ('Content-Type: application/xhtml+xml' in head)
        )
        # Fetch the body if HTML.
        if ishtml:
            body = check_output_nofail(
                'GET',
                '-H', 'User-Agent: ' + user_agent,
                '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),
                destination,
            )
        else:
            body = ''
        # Use the HTML title if available.
        if '<title>' in body and '</title>' in body:
            title = body.split('<title>')[1].split('</title>')[0]
        else:
            title = htmlescape.escape('{}', 1, destination)
        #
        html = htmlescape.escape('''Perhaps you're looking for
            <a href="{}" rel="nofollow">{}</a> (without the www)?''',
            2, destination,
            0, title,
        )
    return html
def page(request_uri):
    '''
    Print out the page for the (partially) canonicalized `request_uri`.

    This function assumes `compressout.init` has already been called
    and that `compressout.done` will be called after returning.
    '''
    # Serve XHTML only when the client explicitly accepts it.
    xhtml_mime = 'application/xhtml+xml'
    mime = 'text/html'
    if xhtml_mime in os.getenv('HTTP_ACCEPT', ''):
        mime = xhtml_mime
    # Read the inline stylesheet up front and close the file
    # deterministically; the original leaked the file object by calling
    # open(...).read() inside str.format.
    with open(fs_base_dir + '/style.css') as css_file:
        stylesheet = css_file.read()
    # Deliberate 404: this whole subdomain exists only to point visitors
    # to the canonical site.
    compressout.write_h('Status: 404\n')
    compressout.write_h('Content-Type: {}; charset=UTF-8\n'.format(mime))
    compressout.write_h('\n')
    compressout.write_b('''<!DOCTYPE html>
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <meta charset="utf-8"/>
        <meta name="robots" content="noindex"/>
        <meta name="viewport" content="width=device-width, initial-scale=1"/>
        <link rel="icon" type="image/png"
            href="{proto}://{host}/{base_dir}/favicon.png"/>
        <style type="text/css">
{stylesheet}
        </style>
        <title>Sorry, but I'm not /that/ interested in chemistry</title>
    </head>
    <body>
        <p class="skeletal">
            <a href="https://en.wikipedia.org/wiki/Tridecane" rel="nofollow"
                ><img
                    src="{proto}://{host}/{base_dir}/skeletal.png"
                    alt="(skeletal formula for tridecane) www"
                    width="150" height="24"
            /></a>.{host}
        </p>
        <h1>Sorry, but I'm not <em>that</em> interested in chemistry</h1>
        <p>
            <a href="https://en.wikipedia.org/wiki/Tridecane" rel="nofollow"
                ><img src="{proto}://{host}/{base_dir}/model.png"
                    alt="(Image): balls and sticks model of tridecane"
                    width="800" height="249"
            /></a><br/>
            <a rel="nofollow"
                href="https://en.wikipedia.org/wiki/File:Tridecane_3D_ball-and-stick_model.png"
            >(Image is taken from Wikipedia. License: CC-BY-SA)</a>
        </p>
'''.format(
        host=host, proto=proto, base_dir=base_dir,
        stylesheet=stylesheet,
    ))
    # The actual "did you mean" link to the canonical page.
    compressout.write_b('<p class="link">\n{}\n</p>\n'.format(
        make_html_link(proto + '://' + host + request_uri)
    ))
    compressout.write_b('''
        <p>The acronym WWW has some interesting properties</p>
        <ul>
            <li>
                It takes approximately as long to pronounce WWW as it takes
                to pronounce "world wide web".
            </li>
            <li>
                It looks like the skeletal formula for tridecane, but
                "tridecane" is definitively shorter when pronounced.
            </li>
            <li>
                It's used to make many URLs four bytes longer for no good
                reason.
            </li>
        </ul>
        <p class="footer">
            Page made by <a rel="nofollow"
                href="https://oskog97.com/read/?path=/tridecane/index.cgi&referer=http://tridecane.oskog97.com/&title=Back+to+the+tridecane+page"
            >Tridecane</a>.
        </p>
    </body>
</html>\n''')
def main():
    r'''
    Handle requests to the tridecane/www subdomain.

    - /robots.txt is served properly.
    - Certain URLs can be redirected.
    - Static pages will have parameters stripped out.
    - Dynamic pages will have the valid parameters sorted.
    - Dynamic pages that don't use the usual format for the query
      string are also supported.

    This function assumes `compressout.init` has already been called
    and that `compressout.done` will be called after returning.

    The global variable `redirect` is a list of tuples of
    (regex, replacement). The replacement part follows the Python
    regex syntax with \1 \2 ... as back-references.

    The global variable `parameters` is a dictionary where the keys
    are the parts before '?' of the relative URLs to the dynamic pages.
    The value is either '*' which means that the query string will be
    untouched, or a list of strings where each string is a valid
    parameter/variable. The parameters on the canonicalized relative
    URL will come in the same order as specified in `parameters`.

    Misc global variables
    ---------------------
    `host`          The hostname for the canonical site version.
    `proto`         'http' or 'https'
    `base_dir`      Relative URL without leading and trailing slash;
                    where to find external files (images) for the
                    generated pages.
    `fs_base_dir`   Absolute filesystem path without trailing slash
                    to the directory `base_dir`. (robots.txt and
                    style.css are supposed to be there.)
    `referer_site`  For setting the Referer HTTP header when
                    pulling in the title from the preferred site
                    version.
                    scheme://host (no trailing slash)
                    host is the hostname for the tridecane site.
    `user_agent`    For setting the User-Agent HTTP header when
                    pulling in the title from the preferred site
                    version.
    `maxload`       See the docstring for `overloaded`.
    '''
    request_uri = os.getenv('REQUEST_URI')
    compressout.write_h('Cache-Control: max-age=1209600\n')
    # /robots.txt
    if request_uri == '/robots.txt':
        try:
            # BUG FIX: the original called `.read()` twice -- once on
            # the file object and once on the resulting *string*, which
            # raised AttributeError on every robots.txt request.  Also
            # close the file deterministically.
            with open(fs_base_dir + '/robots.txt') as robots_file:
                robots_txt = robots_file.read()
            compressout.write_h('Content-Type: text/plain\n\n')
            compressout.write_b(robots_txt)
        except IOError:
            compressout.write_h('Status: 404\n\n')
        return
    # Deal with redirections.
    for regex, replacement in redirect:
        if re.match(regex, request_uri) is not None:
            destination = re.sub(regex, replacement, request_uri)
            compressout.write_h('Status: 301\n')
            compressout.write_h(
                'Location: {proto}://{host}{destination}\n'.format(
                    host=host, proto=proto, destination=destination
                )
            )
            compressout.write_h('\n')
            return
    # Automatically canonicalize parameters.
    if '?' in request_uri:
        cgi_name, query_string = request_uri.split('?', 1)
        if cgi_name not in parameters:
            # Static page: should not have any parameters.
            request_uri = cgi_name
        elif parameters[cgi_name] == '*':
            # Opaque query string: do not change the request_uri.
            pass
        else:
            # Keep only the known parameters, emitted in the canonical
            # order defined by `parameters[cgi_name]`.
            valid_parameters = []
            for valid in parameters[cgi_name]:
                if (valid + '=') in query_string:
                    if query_string.startswith(valid + '='):
                        value = query_string.split('=', 1)[1]
                    else:
                        value = query_string.split('&' + valid + '=')[1]
                    value = value.split('&')[0]
                    valid_parameters.append(valid + '=' + value)
            request_uri = cgi_name + '?' + '&'.join(valid_parameters)
    # Let `page` print the actual page.
    page(request_uri)
def overloaded():
    '''
    Prevent over-revving the server.

    Returns True if a 503 page should be shown (and has already been
    written), and False if the request may proceed.

    The global variable `maxload` is a dictionary containing:
        `load-avg-1`            Maximum average load during the last
                                minute.
        `throttle-file`         A file for recording the times of
                                the last `throttle-requests` requests.
                                Initial content SHOULD be '0\\n', ie. a
                                zero.
        `throttle-requests`     The highest allowed number of requests
                                in `throttle-time` seconds.
        `throttle-time`         The shortest allowed time
                                `throttle-requests` requests are
                                allowed to be made.
        `retry-after`           Time in seconds for the Retry-After
                                HTTP header.
        `503-file`              The filesystem path to a static HTML
                                file with a Service Unavailable
                                message.
    '''
    def status503():
        # Emit a complete 503 response from the static error page.
        compressout.write_h('Status: 503\n')
        # BUG FIX: the Content-Type header was missing its trailing
        # '\n', which glued it to the Retry-After header and produced a
        # malformed CGI response.
        compressout.write_h('Content-Type: text/html; charset=UTF-8\n')
        compressout.write_h('Retry-After: {}\n'.format(maxload['retry-after']))
        compressout.write_h('\n')
        with open(maxload['503-file']) as error_page:
            compressout.write_b(error_page.read())
    # Hard limit: refuse service when the machine itself is busy.
    if os.getloadavg()[0] > maxload['load-avg-1']:
        status503()
        return True
    # Soft limit: no more than `throttle-requests` requests per
    # `throttle-time` seconds, tracked in `throttle-file`.
    try:
        with open(maxload['throttle-file']) as throttle_file:
            # list(...) so indexing/insert below also works on Python 3,
            # where map returns an iterator.
            access_times = list(map(
                float, throttle_file.read().strip().split(':')
            ))
    except (IOError, OSError, ValueError):
        # Missing or corrupt throttle file: fall back to "never
        # accessed" rather than crashing (was a bare except).
        access_times = [0]
    if time.time() - access_times[-1] < maxload['throttle-time']:
        status503()
        return True
    # Record this request and keep only the newest entries.
    access_times.insert(0, time.time())
    access_times = access_times[:maxload['throttle-requests']]
    with open(maxload['throttle-file'], 'w') as throttle_file:
        throttle_file.write(':'.join(map(str, access_times)) + '\n')
    return False
if __name__ == '__main__':
    # Set up the output buffering/compression layer before anything
    # writes headers or body.
    compressout.init()
    # `overloaded` returns a truthy value (and prints its own 503
    # response) when the server is too busy; only serve the page
    # otherwise.
    if not overloaded():
        main()
    # Flush/finish the compressed output in every case.
    compressout.done()