Tridecane page maker

Create joke pages for the unpreferred www and tridecane subdomains. The pages will link to the intended page.

See the doc-string for `main` for more information.

Or try it at http://tridecane.oskog97.com/read/?path=/tridecane/index.cgi.


  1. #!/usr/bin/python
  2. # Copyright (c) 2016, Oskar Skog
  3. # This file is released under the 2-clause BSD license,
  4. # a.k.a. the FreeBSD license
  5. # URL: <http://oskog97.com/policy.html#license>
  6. '''
  7. Create joke pages for the unpreferred www and tridecane subdomains.
  8. The pages will link to the intended page.
  9. See the doc-string for `main` for more information.
  10. Try it at: http://tridecane.oskog97.com/read/?path=/tridecane/index.cgi
  11. '''
  12. import sys
  13. sys.path.append('/var/www') # NOTICE
  14. import htmlescape
  15. import compressout
  16. import os
  17. import base64
  18. import re
  19. import subprocess
  20. import time
  21. import cgitb
  22. cgitb.enable()
  23. #id="global-variables"
  24. # Global variables
  25. host = 'oskog97.com'
  26. proto = 'https'
  27. base_dir = 'tridecane'
  28. fs_base_dir = '/var/www/tridecane'
  29. referer_site = 'http://tridecane.oskog97.com'
  30. user_agent = ('Tridecane linkmaker ' +
  31.     'https://oskog97.com/read/?path=/tridecane/index.cgi')
  32. redirect = [
  33.     (r'^/favicon\.ico$', r'/favicon.png'),
  34.     (r'^/sitemap(.*)\.xml$', r'/sitemap\1.xml'),
  35.     (r'^/read\.py\?(.*)sitemap=xml(&.*)?$', r'/read.py?sitemap=xml'),
  36.     (r'^/google(.*)\.html$', r'/google\1.html'),
  37. ]
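
(For illustration only, not part of the original file: this is how `main` further down applies one of these (regex, replacement) pairs, using a made-up request path.)

import re

# Sketch: a matching rule is selected with re.match(), the destination
# path is built with re.sub(), and `main` then prefixes it with
# proto://host in the Location header.
regex, replacement = (r'^/favicon\.ico$', r'/favicon.png')
if re.match(regex, '/favicon.ico'):
    print(re.sub(regex, replacement, '/favicon.ico'))   # -> /favicon.png
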
  38. # '*' means that QUERY_STRING must not be altered
  39. # while a *list* of strings means that only the listed parameters may
  40. # be used, and only in the specified order.  (Mutually exclusive
  41. # parameters are NOT handled.)
  42. parameters = {
  43.     '/read/': ['sitemap', 'path', 'download', 'referer', 'title'],
  44.     '/test.cgi': '*',
  45. }
  46. # I don't want the system to be overloaded, so I'll put some
  47. # restrictions here.
  48. maxload = {
  49.     'throttle-file': '/var/www/tridecane/throttle',
  50.     'throttle-requests': 3,
  51.     'throttle-time': 6,
  52.     'load-avg-1': 3.5,
  53.     'retry-after': 90,
  54.     '503-file': '/var/www/oops/cgi503.html',
  55. }
  56. def make_html_link(destination):
  57.     '''
  57.     Create html code (inline, doesn't come pre-wrapped in a p element)
  58.     with a link to `destination`.
  59.     
  60.     `destination` MUST be a valid URL.
  61.     
  62.     The link text will be pulled from the target page's title if any.
  63.     
  64.     If the target responds with 4xx a link to the homepage will be
  65.     returned instead.  The hostname and protocol for the homepage
  66.     are defined in the global variables `host` and `proto`.
  67.     
  68.     This function requires the HEAD(1) and GET(1) commands from
  69.     lwp-request(1).
  70.     '''
  71.     def check_output_nofail(*args):
  72.         cmd = ' '.join(map(
  73.             lambda arg: "'" + arg.replace("'", "'\"'\"'") + "'",
  74.             args
  75.         ))
  76.         return subprocess.check_output(cmd + ' || true', shell=True)
  77.         
  78.     head = check_output_nofail(
  79.         'HEAD',
  80.         '-H', 'User-Agent: ' + user_agent,
  81.         '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),
  82.         destination,
  83.     )
  84.     # NOTICE: `status` is only the first digit.
  85.     status = head[0]
  86.     if status == '4':
  87.         html = """The page you're looking for doesn't seem to exist.
  88.             Would you like to go to the <a href="{}">homepage</a> instead?
  89.             """.format(proto + '://' + host + '/')
  90.     elif status == '5':
  91.         html = htmlescape.escape(
  92.             '''&lt;<a href="{}" rel="nofollow">{}</a>&gt; is temporarily
  93.             malfunctioning.  Try again later.''',
  94.             2, destination,
  95.             1, destination,
  96.         )
  97.     elif status != '2':
  98.         html = htmlescape.escape(
  99.             'Unknown error with &lt;<a href="{}" rel="nofollow">{}</a>&gt;',
  100.             2, destination,
  101.             1, destination,
  102.         )
  103.     else:
  104.         ishtml = (
  105.             ('Content-Type: text/html' in head) or
  106.             ('Content-Type: application/xhtml+xml' in head)
  107.         )
  108.         # Fetch the body if HTML.
  109.         if ishtml:
  110.             body = check_output_nofail(
  111.                 'GET',
  112.                 '-H', 'User-Agent: ' + user_agent,
  113.                 '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),
  114.                 destination,
  115.             )
  116.         else:
  117.             body = ''
  118.         # Use the HTML title if available.
  119.         if '<title>' in body and '</title>' in body:
  120.             title = body.split('<title>')[1].split('</title>')[0]
  121.         else:
  122.             title = htmlescape.escape('{}', 1, destination)
  123.         #
  124.         html = htmlescape.escape('''Perhaps you're looking for
  125.             <a href="{}" rel="nofollow">{}</a> (without the www)?''',
  126.             2, destination,
  127.             0, title,
  128.         )
  129.     return html
  130.     
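
(A sketch, not part of the original file, of what `check_output_nofail` actually hands to the shell, with a made-up argument list. `|| true` keeps a failing HEAD/GET from raising CalledProcessError, and because the code relies on lwp-request's HEAD printing the status line (e.g. "200 OK") first, `head[0]` above is the first digit of the status code.)

# Sketch, not part of the original file.
args = ('HEAD', '-H', 'User-Agent: test agent', 'https://oskog97.com/')
cmd = ' '.join("'" + a.replace("'", "'\"'\"'") + "'" for a in args)
print(cmd + ' || true')
# -> 'HEAD' '-H' 'User-Agent: test agent' 'https://oskog97.com/' || true
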
  131. def page(request_uri):
  132.     '''
  133.     Print out the page for the (partially) canonicalized `request_uri`.
  134.     
  135.     This function assumes `compressout.init` has already been called
  136.     and that `compressout.done` will be called after returning.
  137.     '''
  138.     xhtml_mime = 'application/xhtml+xml'
  139.     mime = 'text/html'
  140.     if xhtml_mime in os.getenv('HTTP_ACCEPT', ''):
  141.         mime = xhtml_mime
  142.     compressout.write_h('Status: 404\n')
  143.     compressout.write_h('Content-Type: {}; charset=UTF-8\n'.format(mime))
  144.     compressout.write_h('\n')
  145.     compressout.write_b('''<!DOCTYPE html>
  146. <html lang="en" xmlns="http://www.w3.org/1999/xhtml">
  147.     <head>
  148.         <meta charset="utf-8"/>
  149.         <meta name="robots" content="noindex"/>
  150.         <meta name="viewport" content="width=device-width, initial-scale=1"/>
  151.         <link rel="icon" type="image/png"
  152.             href="{proto}://{host}/{base_dir}/favicon.png"/>
  153.         <style type="text/css">
  154.             {stylesheet}
  155.         </style>
  156.         <title>Sorry, but I'm not /that/ interested in chemistry</title>
  157.     </head>
  158.     <body>
  159.         <p class="skeletal">
  160.             <a href="https://en.wikipedia.org/wiki/Tridecane" rel="nofollow"
  161.             ><img 
  162.                 src="{proto}://{host}/{base_dir}/skeletal.png"
  163.                 alt="(skeletal formula for tridecane) www"
  164.                 width="150" height="24"
  165.             /></a>.{host}
  166.         </p>
  167.         <h1>Sorry, but I'm not <em>that</em> interested in chemistry</h1>
  168.         <p>
  169.             <a href="https://en.wikipedia.org/wiki/Tridecane" rel="nofollow"
  170.             ><img src="{proto}://{host}/{base_dir}/model.png"
  171.                 alt="(Image): balls and sticks model of tridecane"
  172.                 width="800" height="249"
  173.             /></a><br/>
  174.             <a rel="nofollow"
  175. href="https://en.wikipedia.org/wiki/File:Tridecane_3D_ball-and-stick_model.png"
  176.             >(Image is taken from Wikipedia. License: CC-BY-SA)</a>
  177.         </p>
  178. '''.format(
  179.         host=host, proto=proto, base_dir=base_dir,
  180.         stylesheet=open(fs_base_dir + '/style.css').read(),
  181.     ))
  182.     compressout.write_b('<p class="link">\n{}\n</p>\n'.format(
  183.         make_html_link(proto + '://' + host + request_uri)
  184.     ))
  185.     compressout.write_b('''
  186.         <p>The acronym WWW has some interesting properties</p>
  187.         <ul>
  188.             <li>
  189.                 It takes approximately as long to pronounce WWW as it takes
  190.                 to pronounce "world wide web".
  191.             </li>
  192.             <li>
  193.                 It looks like the skeletal formula for tridecane, but
  194.                 "tridecane" is definitely shorter when pronounced.
  195.             </li>
  196.             <li>
  197.                 It's used to make many URLs four bytes longer for no good
  198.                 reason.
  199.             </li>
  200.         </ul>
  201.         <p class="footer">
  202.             Page made by <a rel="nofollow"
  203. href="https://oskog97.com/read/?path=/tridecane/index.cgi&amp;referer=http://tridecane.oskog97.com/&amp;title=Back+to+the+tridecane+page"
  204.             >Tridecane</a>.
  205.         </p>
  206.     </body>
  207. </html>\n''')
  208. def main():
  209.     r'''
  210.     Handle requests to the tridecane/www subdomain.
  211.     
  212.     - /robots.txt is served properly.
  213.     - Certain URLs can be redirected.
  214.     - Static pages will have parameters stripped out.
  215.     - Dynamic pages will have the valid parameters sorted.
  216.     - Dynamic pages that don't use the usual format for the query
  217.       string are also supported.
  218.     
  219.     This function assumes `compressout.init` has already been called
  220.     and that `compressout.done` will be called after returning.
  221.     
  222.     The global variable `redirect` is a list of tuples of
  223.     (regex, replacement).  The replacement part follows the Python
  224.     regex syntax with \1 \2 ... as back-references.
  225.     
  226.     The global variable `parameters` is a dictionary where the keys
  227.     are the parts before '?' of the relative URLs to the dynamic pages.
  228.     The value is either '*' which means that the query string will be
  229.     untouched, or a list of strings where each string is a valid
  230.     parameter/variable. The parameters on the canonicalized relative
  231.     URL will come in the same order as specified in `parameters`.
  232.     
  233.     Misc global variables
  234.     ---------------------
  235.         
  236.         `host`          The hostname for the canonical site version.
  237.         
  238.         `proto`         'http' or 'https'
  239.         
  240.         `base_dir`      Relative URL without leading and trailing slash;
  241.                         where to find external files (images) for the
  242.                         generated pages.
  243.         
  244.         `fs_base_dir`   Absolute filesystem path without trailing slash
  245.                         to the directory `base_dir`.  (robots.txt and
  246.                         style.css are supposed to be there.)
  247.         
  248.         `referer_site`  For setting the Referer HTTP header when
  249.                         pulling in the title from the preferred site
  250.                         version.
  251.                         scheme://host (no trailing slash)
  252.                         host is the hostname for the tridecane site.
  253.     
  254.         `user_agent`    For setting the User-Agent HTTP header when
  255.                         pulling in the title from the preferred site
  256.                         version.
  257.         
  258.         `maxload`       See the docstring for `overloaded`.
  259.         
  260.     
  261.     '''
  262.     request_uri = os.getenv('REQUEST_URI')
  263.     #query_string = os.getenv('QUERY_STRING', '') # BUG
  264.     compressout.write_h('Cache-Control: max-age=1209600\n')
  265.     # /robots.txt
  266.     if request_uri == '/robots.txt':
  267.         try:
  268.             robots_txt = open(fs_base_dir + '/robots.txt').read()
  269.             compressout.write_h('Content-Type: text/plain\n\n')
  270.             compressout.write_b(robots_txt)
  271.         except IOError:
  272.             compressout.write_h('Status: 404\n\n')
  273.         return
  274.     # Deal with redirections.
  275.     for regex, replacement in redirect:
  276.         if re.match(regex, request_uri) is not None:
  277.             destination = re.sub(regex, replacement, request_uri)
  278.             compressout.write_h('Status: 301\n')
  279.             compressout.write_h(
  280.                 'Location: {proto}://{host}{destination}\n'.format(
  281.                     host=host, proto=proto, destination=destination
  282.                 )
  283.             )
  284.             compressout.write_h('\n')
  285.             return
  286.     # Automatically canonicalize parameters.
  287.     if '?' in request_uri:
  288.         cgi_name, query_string = request_uri.split('?', 1)
  289.         if cgi_name not in parameters:
  290.             # Should not have any parameters.
  291.             request_uri = cgi_name
  292.         elif parameters[cgi_name] == '*':
  293.             # Do not change the request_uri.
  294.             pass
  295.         else:
  296.             # Auto-correct request_uri.
  297.             valid_parameters = []
  298.             for valid in parameters[cgi_name]:
  299.                 if (valid + '=') in query_string:
  300.                     if query_string.startswith(valid + '='):
  301.                         value = query_string.split('=', 1)[1]
  302.                     else:
  303.                         value = query_string.split('&' + valid + '=')[1]
  304.                     value = value.split('&')[0]
  305.                     valid_parameters.append(valid + '=' + value)
  306.             request_uri = cgi_name + '?' + '&'.join(valid_parameters)
  307.     # Let `page` print the actual page.
  308.     page(request_uri)
  309.     
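
(A worked example of the parameter canonicalization above, with a made-up request URI. '/read/' lists its valid parameters as ['sitemap', 'path', 'download', 'referer', 'title'], so the unknown 'foo' is dropped and the survivors come out in that order; the snippet is a simplified re-implementation of that ordering rule, not part of the original file.)

# Sketch, not part of the original file.
query = {'title': 'Demo', 'foo': 'bar', 'path': '/tridecane/index.cgi'}
order = ['sitemap', 'path', 'download', 'referer', 'title']
canonical = '&'.join(k + '=' + query[k] for k in order if k in query)
print('/read/?' + canonical)   # -> /read/?path=/tridecane/index.cgi&title=Demo
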
  310. def overloaded():
  311.     '''
  312.     Prevent over-revving the server.
  313.     
  314.     Returns True if a 503 page should be shown, and False if not.
  315.     
  316.     The global variable `maxload` is a dictionary containing:
  317.     
  318.         `load-avg-1`            Maximum average load during the last
  319.                                 minute.
  320.         
  321.         `throttle-file`         A file for recording the times of
  322.                                 the last `throttle-requests` requests.
  323.                                 Initial content SHOULD be '0\n', i.e. a
  324.                                 zero.
  325.         
  326.         `throttle-requests`     The highest allowed number of requests
  327.                                 in `throttle-time` seconds.
  328.         
  329.         `throttle-time`         The shortest time in which
  330.                                 `throttle-requests` requests are
  331.                                 allowed to be made.
  332.         
  333.         `retry-after`           Time in seconds for the Retry-After
  334.                                 HTTP header.
  335.         
  336.         `503-file`              The filesystem path to a static HTML
  337.                                 file with a Service Unavailable
  338.                                 message.
  339.                                 
  340.     '''
  341.     def status503():
  342.         compressout.write_h('Status: 503\n')
  343.         compressout.write_h('Content-Type: text/html; charset=UTF-8\n')
  344.         compressout.write_h('Retry-After: {}\n'.format(maxload['retry-after']))
  345.         compressout.write_h('\n')
  346.         compressout.write_b(open(maxload['503-file']).read())
  347.     
  348.     if os.getloadavg()[0] > maxload['load-avg-1']:
  349.         status503()
  350.         return True
  351.     try:
  352.         access_times = map(
  353.             float, open(maxload['throttle-file']).read().strip().split(':')
  354.         )
  355.     except (IOError, ValueError):
  356.         access_times = [0]
  357.     if time.time() - access_times[-1] < maxload['throttle-time']:
  358.         status503()
  359.         return True
  360.     access_times.insert(0, time.time())
  361.     access_times = access_times[:maxload['throttle-requests']]
  362.     f = open(maxload['throttle-file'], 'w')
  363.     f.write(':'.join(map(str, access_times)) + '\n')
  364.     f.close()
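
(A sketch of the throttle bookkeeping with made-up timestamps; the file format, colon-separated floats with the newest first, follows from the read/write code above. Not part of the original file.)

import time

# Sketch, not part of the original file.
access_times = [1461000012.5, 1461000010.1, 1461000008.9]   # newest first
line = ':'.join(map(str, access_times)) + '\n'               # throttle-file contents
# With throttle-requests = 3 and throttle-time = 6, the next request is
# refused when the oldest of the three recorded times is under 6 s old:
print(time.time() - access_times[-1] < 6)
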
  365. if __name__ == '__main__':
  366.     compressout.init()
  367.     if not overloaded():
  368.         main()
  369.     compressout.done()
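
(A rough way to exercise the script outside the web server. The CGI path /var/www/tridecane/index.cgi is an assumption, not stated in this file, and the helper modules under /var/www must be importable on the machine running it.)

import os
import subprocess

# Hypothetical smoke test, not part of the original file.
env = dict(os.environ)
env['REQUEST_URI'] = '/read/?title=Demo&path=/tridecane/index.cgi'
env['HTTP_ACCEPT'] = 'text/html'
print(subprocess.check_output(['/var/www/tridecane/index.cgi'], env=env))
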